diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 3fca45cc068f9916b52b3f99df2baa679d4c3546..49ba9479d49e93143665b8314d04ee8e0efcbf51 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
-cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api)
+cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api string_helper)
 
 if (WITH_DISTRIBUTE)
   cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index 5dc43af117825bf95407255e93e1e4600e8ddd9a..cb82677a281e990d9837f081b0d4d2f3b0a34a26 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -171,10 +171,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank,
       "Only CPU place is supported for ProcessGroupGloo."));
 }
 
-ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
-                                   int rank, int world_size,
-                                   const std::shared_ptr<GlooOptions> options)
-    : ProcessGroup(rank, world_size), _tag(0), _store(store) {
+ProcessGroupGloo::ProcessGroupGloo(
+    const std::shared_ptr<paddle::distributed::Store>& store, int rank,
+    int world_size, const std::shared_ptr<GlooOptions> options)
+    : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) {
   _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
   auto prefix_store =
       ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
index 24f156571a427128f09cd28e632212f47fa4cd47..71e0a40f8a76181d9f4db13ddd57b31de676910b 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
@@ -52,8 +52,7 @@ class ProcessGroupGloo : public ProcessGroup {
 
   class GlooStore : public ::gloo::rendezvous::Store {
    public:
-    explicit GlooStore(
-        const std::shared_ptr<paddle::distributed::TCPStore>& store)
+    explicit GlooStore(const std::shared_ptr<paddle::distributed::Store>& store)
         : _store(store) {}
 
     ~GlooStore() = default;
@@ -87,7 +86,7 @@ class ProcessGroupGloo : public ProcessGroup {
     }
 
    protected:
-    std::shared_ptr<paddle::distributed::TCPStore> _store;
+    std::shared_ptr<paddle::distributed::Store> _store;
   };
 
   class GlooOptions {
@@ -100,9 +99,9 @@ class ProcessGroupGloo : public ProcessGroup {
     std::shared_ptr<::gloo::transport::Device> device;
   };
 
-  explicit ProcessGroupGloo(const std::shared_ptr<GlooStore>& store, int rank,
-                            int world_size,
-                            std::shared_ptr<GlooOptions> options);
+  explicit ProcessGroupGloo(
+      const std::shared_ptr<paddle::distributed::Store>& store, int rank,
+      int world_size, std::shared_ptr<GlooOptions> options);
 
   ~ProcessGroupGloo() = default;
 
@@ -145,7 +144,7 @@ class ProcessGroupGloo : public ProcessGroup {
  protected:
   uint32_t _tag;
   std::shared_ptr<gloo::rendezvous::Context> _context;
-  std::shared_ptr<GlooStore> _store;
+  std::shared_ptr<::gloo::rendezvous::Store> _store;
 };
 
 }  // namespace distributed
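The ProcessGroupGloo change above stops requiring callers to hand in a ready-made GlooStore: the constructor now accepts the generic paddle::distributed::Store and wraps it itself (`_store(new GlooStore(store))`). A minimal sketch of that adapter idea; KVStore and RendezvousStore are illustrative names, not the real Paddle or gloo types:

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

// Hypothetical generic KV interface (stands in for paddle::distributed::Store).
struct KVStore {
  virtual ~KVStore() = default;
  virtual void set(const std::string& key, const std::vector<uint8_t>& value) = 0;
  virtual std::vector<uint8_t> get(const std::string& key) = 0;
};

// Hypothetical rendezvous interface (stands in for ::gloo::rendezvous::Store).
struct RendezvousStore {
  virtual ~RendezvousStore() = default;
  virtual void set(const std::string& key, const std::vector<char>& data) = 0;
  virtual std::vector<char> get(const std::string& key) = 0;
};

// Adapter owned by the process group, mirroring `_store(new GlooStore(store))`:
// callers pass the generic store and never see the rendezvous wrapper.
class StoreAdapter : public RendezvousStore {
 public:
  explicit StoreAdapter(std::shared_ptr<KVStore> store)
      : store_(std::move(store)) {}
  void set(const std::string& key, const std::vector<char>& data) override {
    store_->set(key, std::vector<uint8_t>(data.begin(), data.end()));
  }
  std::vector<char> get(const std::string& key) override {
    auto v = store_->get(key);
    return std::vector<char>(v.begin(), v.end());
  }

 private:
  std::shared_ptr<KVStore> store_;
};
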
diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index 5533f3f4cbf4b136c52b35cb74afefb86cbe73d7..be4c5423943f5076201b75e307094c75d3d9c103 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -17,6 +17,20 @@
 namespace paddle {
 namespace distributed {
 
+static Backend TransToBackend(platform::Place place) {
+  static const std::map<phi::AllocationType, Backend> type_backend = {
+      {phi::AllocationType::GPU, Backend::GPU},
+      {phi::AllocationType::CPU, Backend::CPU},
+  };
+
+  phi::AllocationType type = place.GetType();
+  auto it = type_backend.find(type);
+  PADDLE_ENFORCE_EQ(it != type_backend.end(), true,
+                    platform::errors::InvalidArgument(
+                        "Place type (%s) is not supported. ", place));
+  return it->second;
+}
+
 std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
     const std::vector<Tensor> tensors,
     const std::vector<bool> &is_sparse_gradient,
@@ -297,10 +311,18 @@ EagerReducer::EagerReducer(
         std::dynamic_pointer_cast<egr::GradNodeAccumulation>(grad_node);
     accumulation_grad_node->RegisterReduceHook(
         std::make_shared<egr::CppTensorVoidHook>(reduce_hook));
+
+    gradnode_index_map_[grad_node.get()] = global_var_index;
   }
 
   vars_marked_ready_.resize(tensors_.size(), false);
   local_used_vars_.resize(tensors_.size(), 0);
+
+  if (find_unused_vars_each_step_) {
+    global_used_vars_ = paddle::experimental::empty(
+        ScalarArray({static_cast<int32_t>(tensors_.size())}), DataType::INT32,
+        TransToBackend(inner_place_));
+  }
 }
 
 std::shared_ptr<egr::GradNodeBase> EagerReducer::GetGradNodeFromTensor(
@@ -341,21 +363,10 @@ void EagerReducer::InitializeGroups(
     } else {
       // process the dense gradient.
       InitializeDenseGroups(tensor_indices_, &group);
-      experimental::Backend backend;
-      switch (inner_place_.GetType()) {
-        case phi::AllocationType::GPU:
-          backend = experimental::Backend::GPU;
-          break;
-        case phi::AllocationType::CPU:
-          backend = experimental::Backend::CPU;
-          break;
-        default:
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "Place type (%s) is not supported. ", inner_place_));
-          break;
-      }
+      // experimental::Backend backend = TransToBackend(inner_place_);
       group.dense_contents_ = paddle::experimental::empty(
-          ScalarArray({group.all_length_}), group.dtype_, backend);
+          ScalarArray({group.all_length_}), group.dtype_,
+          TransToBackend(inner_place_));
     }
 
     // map tensors to this group by VariableLocator
@@ -418,6 +429,53 @@ void EagerReducer::InitializeDenseGroups(
   p_group->all_length_ = all_length;
 }
 
+void EagerReducer::TraverseBackwardGraph(const std::vector<Tensor> &outputs) {
+  std::queue<egr::GradNodeBase *> queue;
+  std::set<egr::GradNodeBase *> visited;
+
+  for (const auto &output : outputs) {
+    auto *auto_grad_meta =
+        static_cast<egr::AutogradMeta *>(output.get_autograd_meta());
+    if (!auto_grad_meta) continue;
+    auto shared_grad_node = auto_grad_meta->GetMutableGradNode();
+    if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
+        auto_grad_meta->StopGradient()) {
+      continue;
+    }
+    egr::GradNodeBase *grad_node = shared_grad_node.get();
+    queue.emplace(grad_node);
+  }
+
+  while (!queue.empty()) {
+    egr::GradNodeBase *node = queue.front();
+    queue.pop();
+    const std::vector<std::vector<egr::Edge>> &edges = node->GetEdges();
+    for (size_t i = 0; i < edges.size(); i++) {
+      for (size_t j = 0; j < edges[i].size(); j++) {
+        const egr::Edge &edge = edges[i][j];
+        auto next_node_shared = edge.GetMutableGradNode();
+        if (!next_node_shared || !next_node_shared.get()) {
+          continue;
+        }
+        auto *next_node = next_node_shared.get();
+        const bool was_inserted = visited.insert(next_node).second;
+        if (was_inserted) {
+          queue.emplace(next_node);
+        }
+      }
+    }
+  }
+
+  for (const auto &it : gradnode_index_map_) {
+    if (visited.count(it.first) == 0) {
+      unused_vars_.push_back(it.second);
+      VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
+              << "Tensor " << tensors_[it.second].name() << " at index "
+              << it.second << " is marked as unused.";
+    }
+  }
+}
+
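TraverseBackwardGraph above is a plain breadth-first walk over grad nodes with a visited set; any parameter whose grad node is never reached is reported as unused. A self-contained sketch of the same idea on a toy graph; Node and the index map are illustrative stand-ins, not the egr:: types:

#include <map>
#include <queue>
#include <set>
#include <vector>

struct Node {
  std::vector<Node*> next;  // stands in for the grad node's outgoing edges
};

// Returns the indices of parameters whose grad node is unreachable from roots.
std::vector<int> FindUnused(const std::vector<Node*>& roots,
                            const std::map<Node*, int>& param_index) {
  std::queue<Node*> queue;
  std::set<Node*> visited;
  for (Node* r : roots) {
    if (r != nullptr) queue.push(r);
  }
  while (!queue.empty()) {
    Node* node = queue.front();
    queue.pop();
    for (Node* nxt : node->next) {
      // insert() reports whether the node is new, so each node is visited once
      if (nxt != nullptr && visited.insert(nxt).second) queue.push(nxt);
    }
  }
  std::vector<int> unused;
  for (const auto& kv : param_index) {
    if (visited.count(kv.first) == 0) unused.push_back(kv.second);
  }
  return unused;
}
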
 void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
   VLOG(3) << "after forward, then reset count for backward.";
   grad_need_hooks_ = true;
@@ -429,6 +487,51 @@ void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
   // reinitialize vars_marked_ready_ for next iteration
   vars_marked_ready_.clear();
   vars_marked_ready_.resize(tensors_.size(), false);
+
+  PADDLE_ENFORCE_EQ(
+      groups_need_finalize_, false,
+      platform::errors::PreconditionNotMet(
+          "A serious error has occurred here. Please "
+          "set find_unused_parameters=True to traverse backward graph "
+          "in each step to prepare reduce in advance. If you have "
+          "already set it, there may be several reasons for this error: "
+          "1) Please note that all forward outputs derived from the module "
+          "parameters must participate in the calculation of losses and "
+          "subsequent gradient calculations. If not, the wrapper will hang, "
+          "waiting for autograd to generate gradients for these parameters. "
+          "You can use detach or stop_gradient to make the unused parameters "
+          "detached from the autograd graph. "
+          "2) Used multiple forwards and one backward. You may be able to wrap "
+          "multiple forwards in a model."));
+
+  // The first var to trigger the unused parameter
+  has_marked_unused_vars_ = false;
+
+  if (find_unused_vars_once_ || find_unused_vars_each_step_) {
+    unused_vars_.clear();
+    TraverseBackwardGraph(outputs);
+    // only check once in first step
+    find_unused_vars_once_ = false;
+  }
+
+  if (find_unused_vars_each_step_ && unused_vars_.empty()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "All parameters are involved in the backward pass. "
+           "It is recommended to set find_unused_parameters to False "
+           "to improve performance. However, if unused parameters "
+           "appear in subsequent iterative training, then an error "
+           "will occur. Please make sure that in subsequent training "
+           "every parameter is used in the backward pass, and then set "
+           "find_unused_parameters=False.";
+  }
+
+  if (unused_vars_.size() == tensors_.size()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "There is no parameter on this device involved "
+           "in the backward calculation. If there are "
+           "parameters on other devices involved in the "
+           "backward, then a serious error will occur here.";
+  }
 }
 
 void EagerReducer::AddDistHook(size_t var_index) {
@@ -446,36 +549,104 @@ void EagerReducer::AddDistHook(size_t var_index) {
   auto &tensor = tensors_[var_index];
   const auto &grad_node = GetGradNodeFromTensor(&tensor);
 
-  VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name()
-          << "] arrived and triggered disthook";
+  VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name()
+          << "@Grad] arrived and triggered disthook";
 
   local_used_vars_[var_index] = 1;
 
+  if (!has_marked_unused_vars_) {
+    has_marked_unused_vars_ = true;
+    for (const auto unused_index : unused_vars_) {
+      MarkVarReady(unused_index, false);
+    }
+  }
   MarkVarReady(var_index, true);
 }
 
 void EagerReducer::MarkVarReady(const size_t var_index,
                                 const bool is_used_var) {
+  VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name()
+          << "] is marked ready.";
+  // error happened, if the var is ready before.
+  if (vars_marked_ready_[var_index]) {
+    auto error_info = string::Sprintf(
+        "Error happened: parameter[%d][%s] has been marked ready before. "
+        "Please set find_unused_parameters=True to traverse backward graph "
+        "in each step to prepare reduce in advance. If you have set it, "
+        "there may be several reasons for this error: "
+        "1) In multiple reentrant backward passes, some parameters are "
+        "reused. "
+        "2) Using model parameters outside of forward function. Please "
+        "make sure that model parameters are not shared in concurrent "
+        "forward-backward passes.",
        var_index, tensors_[var_index].name());
+
+    PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false,
+                      platform::errors::PreconditionNotMet(error_info));
+
+    error_info +=
+        "3) Unused parameters retrieval is incorrect. "
+        "The return value of forward will be used to retrieve"
+        " the unused parameters of the entire model. These "
+        "gradients of unused parameters will not be synchronized "
+        "between multiple cards. However, if the unused "
+        "parameters participate in the backward calculation "
+        "again at a later time (e.g. after the forward function, "
+        "the loss calculation uses the unused "
+        "parameters of the forward pass and triggers backward), "
+        "its gradient will be wrong.";
+
+    PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true,
+                      platform::errors::PreconditionNotMet(error_info));
+  } else {
+    vars_marked_ready_[var_index] = true;
+  }
+
+  groups_need_finalize_ = true;
+
   const auto &var_locator = variable_locators_[var_index];
   const auto group_index = var_locator.group_index;
   const auto inside_group_index = var_locator.inside_group_index;
 
   auto &group = groups_[group_index];
   auto &group_tensor = group.dense_tensors_[inside_group_index];
-  auto *autograd_meta = tensors_[var_index].get_autograd_meta();
-  auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
-
-  group_tensor
-      .ShareDataWith(
-          *(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
-      .Resize({grad_tensor.numel()});
-
-  vars_marked_ready_[var_index] = true;
+  const auto length = group.length_[inside_group_index];
+
+  if (is_used_var) {
+    auto *autograd_meta = tensors_[var_index].get_autograd_meta();
+    auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
+    group_tensor
+        .ShareDataWith(
+            *(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
+        .Resize({grad_tensor.numel()});
+  } else {
+    // TODO(shenliang03): maybe save the memory by avoiding tensor construction
+    if (!group_tensor.initialized()) {
+      group_tensor.Resize({static_cast<int64_t>(length)});
+      group_tensor.mutable_data(inner_place_, group.dtype_);
+    }
+    if (HasGrad(var_index)) {
+      VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad";
+      auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]);
+      group_tensor
+          .ShareDataWith(*(
+              std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor->impl())))
+          .Resize({length});
+    } else {
+      VLOG(3) << "Tensor[" << tensors_[var_index].name()
+              << "] doesn't have grad";
+      auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
+      group_tensor.Resize({static_cast<int64_t>(length)});
+      phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0);
+    }
+  }
 
   if (--group.pending_ == 0) {
     // can start allreduce
     MarkGroupReady(group_index);
   }
+
+  if (next_group_ == groups_.size()) {
+    FinalizeBackward();
+  }
 }
 
 void EagerReducer::MarkGroupReady(size_t group_index) {
@@ -501,6 +672,92 @@ void EagerReducer::MarkGroupReady(size_t group_index) {
   }
 }
 
+bool EagerReducer::HasGrad(size_t var_index) {
+  auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]);
+  if (grad && grad->is_initialized()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void EagerReducer::ProcessUnusedDenseVars() {
+  // The calculation stream must be used here to
+  // avoid conflicts with communication.
+  VLOG(3) << "Local used vars : "
+          << string::join_strings(local_used_vars_, ',');
+
+  const auto *dev_ctx =
+      platform::DeviceContextPool::Instance().Get(inner_place_);
+  auto *global_used_tensor =
+      std::dynamic_pointer_cast<phi::DenseTensor>(global_used_vars_.impl())
+          .get();
+  framework::TensorFromVector(local_used_vars_, *dev_ctx,
+                              global_used_tensor);
+
+  distributed::AllreduceOptions opts;
+  opts.reduce_op = ReduceOp::SUM;
+  std::vector<Tensor> reduce_tensors = {global_used_vars_};
+  process_group_->AllReduce(reduce_tensors, opts)->Synchronize();
+
+  framework::TensorToVector(*global_used_tensor, *dev_ctx,
+                            &local_used_vars_);
+  dev_ctx->Wait();
+
+  // sync compute stream to get global used var message,
+  // but maybe affect speed performance
+  VLOG(3) << "Global used vars : "
+          << string::join_strings(local_used_vars_, ',');
+
+  for (const auto var_index : unused_vars_) {
+    const bool global_unused = (local_used_vars_[var_index] == 0);
+
+    // global used but local unused, set grad
+    VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
+            << "Var [" << var_index << "] [" << tensors_[var_index].name()
+            << "] global_unused: " << global_unused
+            << " has grad: " << HasGrad(var_index);
+
+    if (!global_unused) {
+      VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank "
+              << process_group_->GetRank() << "]";
+      const auto &var_locator = variable_locators_[var_index];
+      const auto group_index = var_locator.group_index;
+      const auto &group = groups_[group_index];
+      const auto inside_group_index = var_locator.inside_group_index;
+      auto &src_tensor = group.dense_tensors_[inside_group_index];
+
+      Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));
+
+      auto dest_var_base = tensors_[var_index];
+      auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base);
+      grad_tensor->copy_(grad_value, inner_place_, true);
+      grad_tensor->reshape(dest_var_base.shape());
+    }
+  }
+}
+
+void EagerReducer::FinalizeBackward() {
+  groups_need_finalize_ = false;
+  grad_need_hooks_ = false;
+  for (auto &group : groups_) {
+    group.task->Synchronize();
+  }
+
+  for (auto &group : groups_) {
+    group.SplitTensors(inner_place_);
+  }
+
+  if (find_unused_vars_each_step_) {
+    ProcessUnusedDenseVars();
+    local_used_vars_.clear();
+    local_used_vars_.resize(tensors_.size(), 0);
+    VLOG(3) << "ProcessUnusedDenseVars is finished.";
+  }
+
+  VLOG(3) << "In the batch, Reducer is finished.";
+}
+
 void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
                                           const int curr_group_index) {
   // The overall timeline: concat > div_nranks > allreduce > split
@@ -513,24 +770,14 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
   group->ConcatTensors(inner_place_);
 
   // div nranks
-  double scaling = 1.0 / nranks_;
-  paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false);
+  paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0,
+                               false);
 
   // all_reduce
   std::vector<Tensor> reduce_tensors = {group->dense_contents_};
-  tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts));
+  group->task = process_group_->AllReduce(reduce_tensors, opts);
 
-  if (tasks_.size() == groups_.size()) {
-    for (size_t index = 0; index < tasks_.size(); index++) {
-      auto &task = tasks_.back();
-      task->Synchronize();
-      tasks_.pop_back();
-    }
-    for (size_t index = 0; index < groups_.size(); index++) {
-      auto &group = groups_[index];
-      group.SplitTensors(inner_place_);
-    }
-  }
+  // split in FinalizeBackward()
 }
 
 std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
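ProcessUnusedDenseVars above decides "globally unused" by allreduce-summing a per-parameter 0/1 usage vector across ranks (ReduceOp::SUM on global_used_vars_): a parameter may be skipped only when no rank used it, otherwise every rank must still produce a gradient to keep the collective in sync. A sketch of just that reduction rule, with the collective replaced by a plain sum over pre-gathered per-rank vectors:

#include <cstdint>
#include <vector>

// Element-wise sum across ranks; in the real code this is a SUM AllReduce on
// the int32 global_used_vars_ tensor rather than a local loop.
std::vector<int32_t> SumUsage(
    const std::vector<std::vector<int32_t>>& per_rank) {
  std::vector<int32_t> global(per_rank.front().size(), 0);
  for (const auto& local : per_rank) {
    for (size_t i = 0; i < local.size(); ++i) global[i] += local[i];
  }
  return global;
}

// Parameter i is globally unused iff global[i] == 0; only then is it safe for
// every rank to keep a zero-filled gradient without desynchronizing ranks.
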
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
index ac6f3fbe5956cd47d4385343509d41afec0b69a4..d3ffa8498a14b0d0ade02ea459e1c6058550122f 100644
--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -28,6 +28,8 @@
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/ext_compat_utils.h"
 #include "paddle/phi/common/data_type.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/utils/string/string_helper.h"
 
 namespace paddle {
 namespace distributed {
@@ -35,6 +37,7 @@
 using Tensor = paddle::experimental::Tensor;
 using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
 using ScalarArray =
     paddle::experimental::ScalarArrayBase<paddle::experimental::Tensor>;
+using Backend = paddle::experimental::Backend;
 
 std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
     const std::vector<Tensor>, const std::vector<bool> &is_sparse_gradient,
@@ -61,6 +64,9 @@ class EagerGroup {
   // external message of group
   phi::DataType dtype_;
 
+  // help to sync
+  std::shared_ptr<ProcessGroup::Task> task;
+
   // context is used to select the stream for concat
   void ConcatTensors(const platform::Place &);
 
@@ -98,6 +104,10 @@ class EagerReducer {
   void MarkVarReady(const size_t var_index, const bool is_used_var);
   void MarkGroupReady(const size_t group_index);
   void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
+  void FinalizeBackward();
+  void TraverseBackwardGraph(const std::vector<Tensor> &outputs);
+  void ProcessUnusedDenseVars();
+  bool HasGrad(size_t var_index);
 
  private:
   std::vector<Tensor> tensors_;
@@ -105,7 +115,6 @@ class EagerReducer {
   std::vector<bool> is_sparse_gradient_;
   std::shared_ptr<distributed::ProcessGroup> process_group_;
   std::vector<size_t> group_size_limits_;
-  bool find_unused_vars_each_step_;
 
   std::vector<EagerGroup> groups_;
   std::vector<TensorLocator> variable_locators_;
@@ -113,12 +122,20 @@ class EagerReducer {
   platform::Place inner_place_;
   size_t next_group_ = 0;
   int64_t nranks_ = -1;
-  std::vector<std::shared_ptr<ProcessGroup::Task>> tasks_;
 
   bool grad_need_hooks_{false};
   std::vector<bool> vars_marked_ready_;
-  std::vector<int64_t> local_used_vars_;
+  std::vector<int32_t> local_used_vars_;
+
+  // The following variables help to handle unused vars
+  std::vector<size_t> unused_vars_;
+  std::map<egr::GradNodeBase *, size_t> gradnode_index_map_;
+  bool has_marked_unused_vars_{false};
+  bool find_unused_vars_each_step_{false};
+  bool find_unused_vars_once_{true};
+  bool groups_need_finalize_{false};
+  Tensor global_used_vars_;
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
index ba6a936d68651c0bcf3815eab58b5a6e66d7024c..1be3b31de00a6bb94b8ad16bff4bf9c1fa61123f 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
+++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
@@ -86,9 +86,9 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x,
   scale_node->SetTensorWrappers_X({x});
 
   // Set Grad out rank as same as fwd input and set stop gradient to bwd
-  scale_node->SetGradOutMeta(p_autograd_in, /*slot id*/ 0);
+  scale_node->SetGradOutMeta(x, /*slot id*/ 0);
   // Set Grad out rank as same as fwd input and set stop gradient to bwd
-  scale_node->SetGradInMeta(p_autograd_out, /*slot id*/ 0);
+  scale_node->SetGradInMeta(out, /*slot id*/ 0);
 
   // Set History for output set current Grad Node for
   EagerUtils::SetHistory(p_autograd_out, scale_node);
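The scale.cc change above switches SetGradOutMeta/SetGradInMeta from taking the AutogradMeta pointer to taking the forward tensor itself, so the grad node can read gradient meta and tensor properties from a single argument. A toy sketch of that overload shape; every type here is illustrative, not the real egr:: API:

#include <iostream>
#include <string>

struct Meta {
  bool stop_gradient = false;
};
struct TensorLike {
  Meta meta;
  std::string dtype = "float32";
};

struct Node {
  // Tensor-taking form: meta *and* dense properties (e.g. dtype) arrive
  // together, so the node can record what it needs in one call.
  void SetGradOutMeta(const TensorLike& t, int slot) {
    std::cout << "slot " << slot << " dtype " << t.dtype << " stop_gradient "
              << t.meta.stop_gradient << "\n";
  }
};

int main() {
  TensorLike x;
  Node node;
  node.SetGradOutMeta(x, /*slot id*/ 0);
  return 0;
}
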
diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc
index 77c39d1b0a37c3946e4c170484118a5fb6f79170..b485beca57a214bc00cb813e9de6a53eca1e67ea 100644
--- a/paddle/fluid/eager/api/utils/tensor_utils.cc
+++ b/paddle/fluid/eager/api/utils/tensor_utils.cc
@@ -30,7 +30,8 @@ namespace egr_utils_api {
 
 bool IsLeafTensor(const paddle::experimental::Tensor& target) {
   std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(target);
-  if (std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node)) {
+  if (!grad_node ||
+      std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node)) {
     return true;
   }
 
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index d9f201dc9f1e8b9a0296288917b82f3e2903330e..b8d59e8dd8b4c60e28323955effd232eb2b51945 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -979,7 +979,9 @@ static bool CollectGradInformationFromOpInfo(
 /* --------------------------------------------------- */
 static std::string GenerateGradNodeCreationContent(
     const ForwardGenerationInfo& fwd_info,
-    const GradNodeGenerationInfo& bwd_info) {
+    const GradNodeGenerationInfo& bwd_info,
+    const std::string& trace_op_body_str,
+    std::map<std::string, std::string> inplace_map = {}) {
   VLOG(6) << "Generating GradNode Creation codes";
 
   const std::string& op_type = fwd_info.GetOpType();
@@ -998,7 +1000,8 @@ static std::string GenerateGradNodeCreationContent(
   // If single output slotname and not duplicable,
   // then generate: "egr::AutogradMeta* p_autograd_out =
   // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")"
-  std::string get_autograd_meta_str = "  // Prepare Autograd Meta \n";
+  std::string get_input_autograd_meta_str = "  // Prepare Autograd Meta \n";
+  std::string get_output_autograd_meta_str = "";
   // If single output slotname and not duplicable,
   // then generate: "egr::AutogradMeta* p_autograd_out =
   // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
@@ -1006,22 +1009,39 @@ static std::string GenerateGradNodeCreationContent(
     const std::string& output_name = output.name();
     const std::string& output_autograd_name = "p_autograd_" + output_name;
 
+    // output autograd_meta should be got after running TraceOP.
     if (output.duplicable()) {
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
-          "  std::vector<egr::AutogradMeta*> %s = "
+          "    std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_output_autograd_meta_str += paddle::string::Sprintf(
           GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
     } else {
-      const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
-          "  egr::AutogradMeta* %s = "
-          "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
-          GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
+      // In inplace op, the case where output is duplicable is not considered.
+      // Replace output directly with input in inplace op.
+      if (!inplace_map.empty() && inplace_map.count(output_name)) {
+        auto inplace_input_name = inplace_map[output_name];
+        const std::string& inplace_input_autograd_name =
+            "p_autograd_" + inplace_input_name;
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    %s = egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str += paddle::string::Sprintf(
+            GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name,
+            inplace_input_name);
+      } else {
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    egr::AutogradMeta* %s = "
+            "egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str +=
+            paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
+                                    output_autograd_name, output_name);
+      }
     }
   }
   VLOG(6) << "Generated outputs autograd_meta";
 
+  // input autograd_meta should be got before running TraceOP (for checking
+  // inplace).
   for (const proto::OpProto::Var& input : in_vars) {
     const std::string& input_name = input.name();
     const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1030,28 +1050,46 @@ static std::string GenerateGradNodeCreationContent(
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
           "  std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
          GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     } else if (input.dispensable()) {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
           "  egr::AutogradMeta* %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
          GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     } else {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
           "  egr::AutogradMeta* %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
          GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     }
   }
   VLOG(6) << "Generated inputs autograd_meta";
 
+  // check inplace input to avoid inplace operations on leaf nodes with
+  // stop_gradient=False.
+  std::string check_inplace_str = "";
+  if (!inplace_map.empty()) {
+    const char* CHECKING_INPLACE_TEMPLATE =
+        "  // Check Inplace\n"
+        "  egr::EagerUtils::CheckInplace(%s, p_autograd_%s, "
+        "require_any_grad);\n";
+    for (auto& inplace_pair : inplace_map) {
+      std::string inplace_name = inplace_pair.second;
+      check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE,
+                                                   inplace_name, inplace_name);
+    }
+    VLOG(6) << "Check Inplace Input";
+  }
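The generated CheckInplace call guards the usual autograd hazard: an inplace op must not mutate a leaf tensor whose gradient is still wanted, because the overwritten value is exactly what gradient accumulation depends on. A minimal sketch of the rule being enforced; the types and function here are illustrative, not egr::EagerUtils::CheckInplace's real signature:

#include <stdexcept>

struct TensorState {
  bool is_leaf = false;       // has no grad node other than accumulation
  bool stop_gradient = true;  // false means a gradient is wanted
};

// Refuse the inplace mutation when any gradient is required and the target
// is a leaf that still wants its gradient.
void CheckInplaceAllowed(const TensorState& t, bool require_any_grad) {
  if (require_any_grad && t.is_leaf && !t.stop_gradient) {
    throw std::invalid_argument(
        "Leaf Tensor that requires grad cannot be used in an inplace op.");
  }
}
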
 
   std::string prepare_autograd_meta_str = "";
-  prepare_autograd_meta_str += get_autograd_meta_str;
+  // For now, only generate the input autograd_meta here;
+  // output autograd_meta will be generated after running TraceOP.
+  prepare_autograd_meta_str += get_input_autograd_meta_str;
   prepare_autograd_meta_str += "\n";
 
   // [GradOpNode] GetTraceBackward
@@ -1066,7 +1104,7 @@ static std::string GenerateGradNodeCreationContent(
   size_t bwd_in_slot_num = out_vars.size();
   size_t bwd_out_slot_num = in_vars.size();
   const char* GRAD_OP_NODE_TEMPLATE =
-      "    auto grad_node = std::make_shared<GradNode%s>(%d, %d);\n";
+      "      auto grad_node = std::make_shared<GradNode%s>(%d, %d);\n";
   grad_node_creation_str += "  // Create GradOpNode\n";
   grad_node_creation_str += paddle::string::Sprintf(
       GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num);
@@ -1075,14 +1113,14 @@ static std::string GenerateGradNodeCreationContent(
   VLOG(6) << "Generated GradOpNode construction";
 
   // [GradOpNode] Set Attrs
-  grad_node_creation_str += "    // Set Attributes\n";
-  grad_node_creation_str += "    grad_node->SetAttrMap(std::move(attrs));\n";
+  grad_node_creation_str += "      // Set Attributes\n";
+  grad_node_creation_str += "      grad_node->SetAttrMap(std::move(attrs));\n";
   grad_node_creation_str +=
-      "    grad_node->SetDefaultAttrMap(std::move(default_attrs));\n";
+      "      grad_node->SetDefaultAttrMap(std::move(default_attrs));\n";
   grad_node_creation_str += "\n";
 
   // [GradOpNode] Set TensorWrappers
-  grad_node_creation_str += "    // Set Tensor Wrappers\n";
+  grad_node_creation_str += "      // Set Tensor Wrappers\n";
   for (const auto& iter : op_base_infos) {
     const std::map<std::string, std::string>& grad_ins_fwd_slotname_map =
         iter.GetGradInsFwdSlotnameMap();
@@ -1094,10 +1132,18 @@ static std::string GenerateGradNodeCreationContent(
         full_reserved = "true";
       }
       const char* SET_TENSOR_WRAPPER_TEMPLATE =
-          "    grad_node->SetTensorWrapper%s(%s, %s);\n";
-      grad_node_creation_str += paddle::string::Sprintf(
-          SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name,
-          full_reserved);
+          "      grad_node->SetTensorWrapper%s(%s, %s);\n";
+      // Replace output directly with input in inplace op.
+      if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) {
+        auto inplace_input_name = inplace_map[tensor_wrapper_name];
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
+            inplace_input_name, full_reserved);
+      } else {
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
+            tensor_wrapper_name, full_reserved);
+      }
     }
   }
   grad_node_creation_str += "\n";
@@ -1115,12 +1161,12 @@ static std::string GenerateGradNodeCreationContent(
       size_t input_position = fwd_inputs_name_pos_map.at(input_name);
 
       const char* SET_GRAD_OUT_META_TEMPLATE =
-          "    grad_node->SetGradOutMeta(%s, %d);\n";
+          "      grad_node->SetGradOutMeta(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position);
+          SET_GRAD_OUT_META_TEMPLATE, input_name, input_position);
 
       const char* ADD_EDGES_TEMPLATE =
-          "    if(%s) grad_node->AddEdges(%s, %d);\n";
+          "      if(%s) grad_node->AddEdges(%s, %d);\n";
       grad_node_creation_str +=
           paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name,
                                   input_autograd_name, input_position);
@@ -1129,11 +1175,11 @@ static std::string GenerateGradNodeCreationContent(
       size_t input_position = fwd_inputs_name_pos_map.at(input_name);
 
       const char* SET_GRAD_OUT_META_TEMPLATE =
-          "    grad_node->SetGradOutMeta(&%s, %d);\n";
+          "      grad_node->SetGradOutMeta(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position);
+          SET_GRAD_OUT_META_TEMPLATE, input_name, input_position);
 
-      const char* ADD_EDGES_TEMPLATE = "    grad_node->AddEdges(&%s, %d);\n";
+      const char* ADD_EDGES_TEMPLATE = "      grad_node->AddEdges(&%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
           ADD_EDGES_TEMPLATE, input_autograd_name, input_position);
     }
@@ -1145,73 +1191,125 @@ static std::string GenerateGradNodeCreationContent(
   std::string pass_stop_gradient_args = "false";
   for (const proto::OpProto::Var& output : out_vars) {
     const std::string& output_name = output.name();
-    const std::string& output_autograd_name = "p_autograd_" + output_name;
-    size_t output_position = fwd_outputs_name_pos_map.at(output_name);
-
-    // Intermediate Tensor does not require SetHistory, nor RetainGrad
-
-    if (output.duplicable()) {
-      pass_stop_gradient_args += ", &" + output_autograd_name;
+    // Replace output directly with input in inplace op.
+    if (!inplace_map.empty() && inplace_map.count(output_name)) {
+      auto inplace_input_name = inplace_map[output_name];
+      const std::string& inplace_input_autograd_name =
+          "p_autograd_" + inplace_input_name;
+      size_t output_position = fwd_outputs_name_pos_map.at(output_name);
+
+      // Intermediate Tensor does not require SetHistory, nor RetainGrad
+      pass_stop_gradient_args += ", " + inplace_input_autograd_name;
       const char* SET_OUT_RANK_TEMPLATE =
-          "  egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n";
+          "      egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+          SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position);
 
       // Intermediate Tensor does not require SetHistory
       if (!output.intermediate()) {
         const char* SET_HISTORY_TEMPLATE =
-            "  egr::EagerUtils::SetHistory(&%s, grad_node);\n";
-        grad_node_creation_str +=
-            paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name);
+            "      egr::EagerUtils::SetHistory(%s, grad_node);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_HISTORY_TEMPLATE, inplace_input_autograd_name);
       }
       const char* SET_GRAD_IN_META_TEMPLATE =
-          "  grad_node->SetGradInMeta(&%s, %d);\n";
+          "      grad_node->SetGradInMeta(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position);
+          SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position);
 
+      // Intermediate Tensor does not require CheckAndRetainGrad
+      if (!output.intermediate()) {
+        VLOG(6) << "Generated Call RetainGradForTensor";
+        const char* RETAIN_GRAD_TEMPLATE =
+            "      egr::EagerUtils::CheckAndRetainGrad(%s);\n";
+        grad_node_creation_str +=
+            paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name);
+      }
     } else {
-      pass_stop_gradient_args += ", " + output_autograd_name;
-      const char* SET_OUT_RANK_TEMPLATE =
-          "  egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n";
-      grad_node_creation_str += paddle::string::Sprintf(
-          SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+      const std::string& output_autograd_name = "p_autograd_" + output_name;
+      size_t output_position = fwd_outputs_name_pos_map.at(output_name);
 
-      // Intermediate Tensor does not require SetHistory
+      // Intermediate Tensor does not require SetHistory, nor RetainGrad
+
+      if (output.duplicable()) {
+        pass_stop_gradient_args += ", &" + output_autograd_name;
+        const char* SET_OUT_RANK_TEMPLATE =
+            "      egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+
+        // Intermediate Tensor does not require SetHistory
+        if (!output.intermediate()) {
+          const char* SET_HISTORY_TEMPLATE =
+              "      egr::EagerUtils::SetHistory(&%s, grad_node);\n";
+          grad_node_creation_str += paddle::string::Sprintf(
+              SET_HISTORY_TEMPLATE, output_autograd_name);
+        }
+        const char* SET_GRAD_IN_META_TEMPLATE =
+            "      grad_node->SetGradInMeta(%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_GRAD_IN_META_TEMPLATE, output_name, output_position);
+
+      } else {
+        pass_stop_gradient_args += ", " + output_autograd_name;
+        const char* SET_OUT_RANK_TEMPLATE =
+            "      egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+
+        // Intermediate Tensor does not require SetHistory
+        if (!output.intermediate()) {
+          const char* SET_HISTORY_TEMPLATE =
+              "      egr::EagerUtils::SetHistory(%s, grad_node);\n";
+          grad_node_creation_str += paddle::string::Sprintf(
+              SET_HISTORY_TEMPLATE, output_autograd_name);
+        }
+        const char* SET_GRAD_IN_META_TEMPLATE =
+            "      grad_node->SetGradInMeta(%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_GRAD_IN_META_TEMPLATE, output_name, output_position);
+      }
+
+      // Intermediate Tensor does not require CheckAndRetainGrad
       if (!output.intermediate()) {
-        const char* SET_HISTORY_TEMPLATE =
-            "  egr::EagerUtils::SetHistory(%s, grad_node);\n";
+        VLOG(6) << "Generated Call RetainGradForTensor";
+        const char* RETAIN_GRAD_TEMPLATE =
+            "      egr::EagerUtils::CheckAndRetainGrad(%s);\n";
         grad_node_creation_str +=
-            paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name);
+            paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name);
       }
-      const char* SET_GRAD_IN_META_TEMPLATE =
-          "  grad_node->SetGradInMeta(%s, %d);\n";
-      grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position);
-    }
-
-    // Intermediate Tensor does not require CheckAndRetainGrad
-    if (!output.intermediate()) {
-      VLOG(6) << "Generated Call RetainGradForTensor";
-      const char* RETAIN_GRAD_TEMPLATE =
-          "  egr::EagerUtils::CheckAndRetainGrad(%s);\n";
-      grad_node_creation_str +=
-          paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name);
     }
   }
   VLOG(6) << "Generated SetGradIn/OutMeta";
 
   // [Generation] GradNode Creation
+  // After getting require_any_grad, first use the CheckInplace method for
+  // inplace ops.
+  // Then execute TraceOp and generate the output autograd_meta.
+  // Finally, construct the GradNode. (Replace output directly with input in
+  // inplace op.)
+  // Add event record
+  std::string event_name = op_type + " node_creation";
   const char* GRAD_NODE_CREATION_TEMPLATE =
-      "  %s"
+      "%s"
       "  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n"
-      "  if(require_any_grad) {\n"
-      "    VLOG(6) << \" Construct Grad for %s \"; \n"
-      "    egr::EagerUtils::PassStopGradient(%s);\n"
-      "%s\n  }";
+      "%s\n"
+      "%s"
+      "  {\n"
+      "    paddle::platform::RecordEvent node_creation_record_event(\"%s\", "
+      "paddle::platform::TracerEventType::Operator, 1);\n"
+      "%s"
+      "    if(require_any_grad) {\n"
+      "      VLOG(6) << \" Construct Grad for %s \"; \n"
+      "      egr::EagerUtils::PassStopGradient(%s);\n"
+      "  %s\n"
+      "    }\n"
+      "  }";
   std::string grad_node_creation_body_str = paddle::string::Sprintf(
       GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
-      compute_require_grad_args, op_type, pass_stop_gradient_args,
-      grad_node_creation_str);
+      compute_require_grad_args, check_inplace_str, trace_op_body_str,
+      event_name, get_output_autograd_meta_str, op_type,
+      pass_stop_gradient_args, grad_node_creation_str);
 
   return grad_node_creation_body_str;
 }
@@ -1221,7 +1319,8 @@ static std::string GenerateGradNodeCreationContent(
 /* -------------------------------- */
 static std::pair<std::string, std::string> GenerateForwardFunctionContents(
     const ForwardGenerationInfo& fwd_info,
-    const GradNodeGenerationInfo& bwd_info) {
+    const GradNodeGenerationInfo& bwd_info,
+    std::map<std::string, std::string> inplace_map = {}) {
   /* --- Process Forward Info ---*/
   const std::string& op_type = fwd_info.GetOpType();
   const std::unordered_map<std::string, size_t>& fwd_inputs_name_pos_map =
@@ -1301,8 +1400,21 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
         core_ops_args_type_info[op_type][input_position] = "list";
 
       } else {
-        const char* FWD_INS_ARG_TEMPLATE =
-            "const paddle::experimental::Tensor& %s";
+        // inplace tensor can't be const
+        const char* FWD_INS_ARG_TEMPLATE;
+        bool flag_find_input_name = false;
+        if (!inplace_map.empty()) {
+          for (auto& inplace_pair : inplace_map) {
+            if (inplace_pair.second == input_name) {
+              flag_find_input_name = true;
+              FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s";
+              break;
+            }
+          }
+        }
+        if (!flag_find_input_name) {
+          FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s";
+        }
         input_args_str_list[input_position] =
             paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name);
@@ -1362,6 +1474,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
 
   // [Generation] Get Outs Map
   std::string outs_contents_str = "";
+  std::string inplace_mapping_str = "";
   for (const proto::OpProto::Var& output : out_vars) {
     const std::string& output_name = output.name();
     std::string outnum = "1";
@@ -1404,6 +1517,22 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
       }
       core_ops_args_info[op_type].push_back(output_var_name);
 
+    } else if (!inplace_map.empty() && inplace_map.count(output_name)) {
+      // In inplace op, replace the output with the input directly.
+      PADDLE_ENFORCE_NE(
+          inplace_map[output_name], "",
+          paddle::platform::errors::InvalidArgument(
+              "Inplace op %s has no input corresponding to output %s.",
+              op_type, output_name));
+      const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },";
+      auto inplace_input_name = inplace_map[output_name];
+      outs_contents_str += paddle::string::Sprintf(
+          FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name);
+
+      // inplace_map used in TraceOp.
+      const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)";
+      inplace_mapping_str += paddle::string::Sprintf(
+          INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name);
     } else {
       if (output.duplicable()) {
         outnum = output_name + "Num";
@@ -1430,6 +1559,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   }
   if (outs_contents_str.size() > 0)
     outs_contents_str.pop_back();  // Remove trailing ","
+  if (inplace_mapping_str.size() > 0)
+    inplace_mapping_str.pop_back();  // Remove trailing ","
 
   const char* FWD_OUTS_MAP_TEMPLATE =
       "  std::map<std::string, "
      "std::vector<std::shared_ptr<egr::EagerVariable>>> outs = { "
      "%s };\n";
@@ ... @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   dygraph_function_args_str +=
       ", const paddle::framework::AttributeMap& attr_map";
 
+  /* --------- Generate TraceOp ----- */
+  // TraceOp should be run after computing require_any_grad (for checking
+  // inplace).
+  // `trace_op_body_str` will be passed as a parameter to
+  // `GenerateGradNodeCreationContent`.
+  std::string trace_op_body_str = "";
   // [Generation] Get TraceOp
   const char* FWD_TRACE_OP_TEMPLATE =
       "  paddle::framework::AttributeMap attrs = attr_map;\n"
@@ -1470,11 +1607,12 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
       "  egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, "
       "outs, attrs, \n"
       "      egr::Controller::Instance().GetExpectedPlace(),\n"
-      "      &default_attrs, true, {});\n";
-  std::string trace_op_str =
-      paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type);
-  generated_function_body += trace_op_str;
-  generated_function_body += "\n";
+      "      &default_attrs, true, {%s});\n";
+  std::string trace_op_str = paddle::string::Sprintf(
+      FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str);
+
+  trace_op_body_str += trace_op_str;
+  trace_op_body_str += "\n";
 
   VLOG(6) << "Generated AttrMap & TraceOp";
 
@@ -1539,48 +1677,64 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
             output_varname, output_var_args_name);
       }
     } else {
-      const char* FWD_OUT_TENSOR_TEMPLATE =
-          "  paddle::experimental::Tensor %s;\n"
-          "  egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n";
-      out_tensor_str =
-          paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname,
-                                  output_name, output_varname);
+      if (!inplace_map.empty() && inplace_map.count(output_name)) {
+        // Modify meta info of inplace tensor.
+        // Bump inplace version of inplace tensor.
+        auto inplace_input_name = inplace_map[output_name];
+        const char* FWD_OUT_TENSOR_TEMPLATE =
+            "  egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n"
+            "  %s.bump_inplace_version();\n"
+            "  VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace "
+            "Strategy.\";\n";
+        out_tensor_str = paddle::string::Sprintf(
+            FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name,
+            inplace_input_name, inplace_input_name);
+      } else {
+        const char* FWD_OUT_TENSOR_TEMPLATE =
+            "  paddle::experimental::Tensor %s;\n"
+            "  egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n";
+        out_tensor_str =
+            paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname,
                                    output_name, output_varname);
+      }
     }
     return_types[return_position] = "paddle::experimental::Tensor";
   }
 
-    return_contents[return_position] = output_varname;
-    generated_function_body += out_tensor_str;
+    if (!inplace_map.empty() && inplace_map.count(output_name)) {
+      // Replace output directly with input in inplace op.
+      return_contents[return_position] = inplace_map[output_name];
+    } else {
+      return_contents[return_position] = output_varname;
+    }
+    trace_op_body_str += out_tensor_str;
   }
-  generated_function_body += "\n";
+  trace_op_body_str += "\n";
 
   VLOG(6) << "Converted Output VarBase to EagerVariable(s)";
+  /* ------ END Generate TraceOp ----- */
 
   // [Generation] Handle core_ops_returns_info
-  core_ops_returns_info[op_type] = return_contents;
+  // avoid inplace op changing core_ops_returns_info
+  if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) {
+    core_ops_returns_info[op_type] = return_contents;
+  }
 
   // [Generation] ComputeRequireGrad -> GradNodeCreation
   if (!bwd_info.GenerateForwardOnly()) {
-    std::string grad_node_creation_body_str =
-        GenerateGradNodeCreationContent(fwd_info, bwd_info);
-
-    // Add event record
-    std::string event_name = op_type + " node_creation";
-    const char* NODE_CREATION_TEMPLATE =
-        "{\n"
-        "   paddle::platform::RecordEvent node_creation_record_event(\"%s\", "
-        "paddle::platform::TracerEventType::Operator, 1);\n"
-        "   %s\n"
-        "}";
-
-    grad_node_creation_body_str = paddle::string::Sprintf(
-        NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str);
+    // If a GradNode needs to be generated, pass `trace_op_body_str`
+    // into `GenerateGradNodeCreationContent`.
+    std::string grad_node_creation_body_str = GenerateGradNodeCreationContent(
+        fwd_info, bwd_info, trace_op_body_str, inplace_map);
 
     generated_function_body += grad_node_creation_body_str;
     generated_function_body += "\n";
 
     // [Generation] Call RetainGradForTensor
     VLOG(6) << "Generated GradNode Creation codes";
+  } else {
+    // If no GradNode needs to be generated, generate TraceOP directly.
+    generated_function_body += trace_op_body_str;
   }
 
   // [Generation] Handle return: Tuple/Vector/Tensor
@@ -1627,7 +1781,13 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   VLOG(6) << "Generated return codes";
 
   // [Generation] Get Full Function
-  std::string function_name = op_type + "_dygraph_function";
+  std::string function_name;
+  if (inplace_map.empty()) {
+    function_name = op_type + "_dygraph_function";
+  } else {
+    // change function_name for inplace op.
+    function_name = op_type + "__dygraph_function";
+  }
 
   if (dygraph_function_args_str.size() > 0) {
     auto iter = dygraph_function_args_str.begin();
@@ -1635,15 +1795,15 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   }
 
   const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE =
-      "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", "
+      "  paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", "
      "paddle::platform::TracerEventType::Operator, 1);";
   std::string event_name = op_type + " dygraph";
   std::string fwd_record_event_str = paddle::string::Sprintf(
      DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name);
   const char* FWD_FUNCTION_TEMPLATE =
       "%s %s(%s) {\n\n"
-      "  %s\n"
-      "  %s\n"
+      "%s\n"
+      "%s\n"
       "}\n\n";
   std::string fwd_function_str = paddle::string::Sprintf(
       FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
@@ -1834,7 +1994,7 @@ static std::string GenerateSingleOpBase(
         !is_op_base_per_duplicable_input) {
       const char* GRAD_OUTS_CONTENT_TEMPLATE =
           "{ \"%s\", egr::EagerUtils::CreateVars( "
-          "this->OutputMeta()[%d].Size() ) },";
+          "this->OutputMeta()[%d].size() ) },";
       outs_contents_str += paddle::string::Sprintf(
           GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position);
     } else {
@@ -2053,7 +2213,7 @@ static std::string GenerateGradNodeCCContents(
 
   if (is_op_base_per_duplicable_input) {
     const char* OP_BASE_PER_DUP_INPUT_TEMPLATE =
-        "  for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n"
+        "  for(size_t i = 0; i < this->OutputMeta()[0].size(); i++) {\n"
         "    %s\n"
        "  }\n";
     generated_grad_function_body = paddle::string::Sprintf(
@@ -2065,6 +2225,8 @@ static std::string GenerateGradNodeCCContents(
       "GradNode%s::ApplyGradientHooks(grads);\n"
       "  std::vector<std::vector<paddle::experimental::Tensor>> outputs(%d);\n"
       "  %s\n"
+      "  if(NeedComplexToRealConversion()) "
+      "HandleComplexGradToRealGrad(&outputs);\n"
       "  return outputs;\n";
   generated_grad_function_body =
       paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(),
                               generated_grad_function_body);
@@ -2424,7 +2586,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     /* --------------------------- */
     VLOG(6) << "-------- GenerateForwardFunctionContents -------";
     std::pair<std::string, std::string> body_and_declaration =
-        GenerateForwardFunctionContents(fwd_info, bwd_info);
+        GenerateForwardFunctionContents(fwd_info, bwd_info, {});
 
     fwd_function_str += body_and_declaration.first + "\n";
 
@@ -2432,6 +2594,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     std::string fwd_function_declare_str = body_and_declaration.second;
     dygraph_forward_api_str += fwd_function_declare_str;
 
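The block that follows builds inplace_map by inverting the {input, output} pairs reported by infer_inplace, then generates a second `op__dygraph_function` overload from that map. A toy sketch of the inversion; the pair layout is an assumption for illustration, not the real OpInfo API:

#include <map>
#include <string>

// Invert input->output pairs into the output->input map the generator uses.
std::map<std::string, std::string> BuildInplaceMap(
    const std::map<std::string, std::string>& in_to_out) {
  std::map<std::string, std::string> out_to_in;
  for (const auto& kv : in_to_out) out_to_in[kv.second] = kv.first;
  return out_to_in;
}

// e.g. {"X", "Out"} becomes {"Out", "X"}, so the generated forward can replace
// the output "Out" with the (mutable, non-const) input tensor "X" throughout.
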
+    auto& infer_inplace =
+        paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_;
+    std::map<std::string, std::string> inplace_map;
+    // Inplace Function Generator.
+    // `sum` op has a duplicable input, so we do not add an inplace strategy
+    // for `sum` for now.
+    if (op_type != "sum" && infer_inplace) {
+      auto in_to_outs = infer_inplace(true);
+      for (auto& inplace_pair : in_to_outs) {
+        inplace_map[inplace_pair.second] = inplace_pair.first;
+      }
+
+      VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------";
+      std::pair<std::string, std::string> inplace_body_and_declaration =
+          GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map);
+
+      fwd_function_str += inplace_body_and_declaration.first + "\n";
+
+      VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------";
+      std::string inplace_fwd_function_declare_str =
+          inplace_body_and_declaration.second;
+      dygraph_forward_api_str += inplace_fwd_function_declare_str;
+    }
+
     if (bwd_info.GenerateForwardOnly()) continue;
 
     VLOG(6) << "-------- GenerateGradNodeHeaderContents -------";
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
index 53af6c1048d2454b1e9f375b837103930026ae54..771351dd4affbb355748c275a59681a6d5ba5577 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
@@ -27,6 +27,7 @@ add_custom_target(eager_final_state_codegen
 
 set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h")
 set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h")
+
 add_custom_target(eager_final_state_python_c_codegen
     COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py"
     "--api_yaml_path=${api_yaml_path}"
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index 9dccba034598bbfef205b6eb85ed5e149ba6d040..6d6644af199568db732af1d37cc29c1f54e7c822 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -657,6 +657,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
         else:
             # Rearrange output order accordingly
             returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n"
+    returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
     returns_str += f"return returns;\n"
 
     grad_node_name = GetGradNodeName(fwd_api_name)
@@ -793,7 +794,7 @@ def GenerateNodeCreationCodes(
     set_edges_list = []
     for name, (_, pos) in forward_inputs_position_map.items():
         input_autograd_meta_name = GetAutoGradMetaName(name)
-        set_grad_out_meta = f"    grad_node->SetGradOutMeta({input_autograd_meta_name}, {pos});"
+        set_grad_out_meta = f"    grad_node->SetGradOutMeta({name}, {pos});"
         set_edges = f"    grad_node->AddEdges({input_autograd_meta_name}, {pos});"
         set_grad_out_meta_list.append(set_grad_out_meta)
         set_edges_list.append(set_edges)
@@ -810,17 +811,18 @@ def GenerateNodeCreationCodes(
         output_autograd_meta_name = GetAutoGradMetaName(name)
         set_out_rank = f"    egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
         set_history = f"    egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
-        set_grad_in_meta = f"    grad_node->SetGradInMeta({output_autograd_meta_name}, {pos});"
+        if num_outputs == 1:
+            set_retain_grad = f"    egr::EagerUtils::CheckAndRetainGrad(api_result);"
+            set_grad_in_meta = f"    grad_node->SetGradInMeta(api_result, {pos});"
+        else:
+            set_retain_grad = f"    egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);"
+            set_grad_in_meta = f"    grad_node->SetGradInMeta(api_result[{pos}], {pos});"
 
         set_out_rank_list.append(set_out_rank)
         set_history_list.append(set_history)
         set_grad_in_meta_list.append(set_grad_in_meta)
-
-        if num_outputs == 1:
-            set_retain_grad = f"    egr::EagerUtils::CheckAndRetainGrad(api_result);"
-        else:
-            set_retain_grad = f"    egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
         set_retain_grad_list.append(set_retain_grad)
+
     set_out_rank_str = "\n".join(set_out_rank_list)
     set_history_str = "\n".join(set_history_list)
     set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index aba3e227ab4b3c52f423ea581a502589fa93f416..8e3b731cfe895f4def592b8cbea483fe0579437c 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -14,9 +14,18 @@
 
 import os
 import argparse
+import logging
 from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
 
-skipped_fwd_api_names = set(["scale"])
+###########################
+## Global Configurations ##
+###########################
+skipped_forward_api_names = set(["scale"])
+
+
+def SkipAPIGeneration(forward_api_name):
+    return (forward_api_name in skipped_forward_api_names)
+
 
 atype_to_parsing_function = {
     "bool": "CastPyArg2Boolean",
@@ -40,64 +49,31 @@
 }
 
 
-def ParseArguments():
-    parser = argparse.ArgumentParser(
-        description='Eager Code Generator Args Parser')
-    parser.add_argument('--api_yaml_path', type=str)
-    parser.add_argument('--output_path', type=str)
-
-    args = parser.parse_args()
-    return args
-
-
 def FindParsingFunctionFromAttributeType(atype):
     if atype not in atype_to_parsing_function.keys():
-        print(f"Unable to find {atype} in atype_to_parsing_function.")
-        assert False
+        assert False, f"Unable to find {atype} in atype_to_parsing_function."
 
     return atype_to_parsing_function[atype]
 
 
-def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
-                            forward_attrs_list, forward_outputs_position_map,
-                            optional_inputs, is_forward_only):
-    # forward_inputs_position_map = { "name" : [type, fwd_position] }
-    # forward_outputs_position_map = { "name" : [type, fwd_position] }
-    # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
-    # optional_inputs = [name0, ...]
- - # Get EagerTensor from args - # Get dygraph function call args - num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list) - num_input_tensors = len(forward_inputs_position_map.keys()) - dygraph_function_call_list = ["" for i in range(num_args)] - get_eager_tensor_str = "" - for name, (ttype, pos) in forward_inputs_position_map.items(): - is_optional = (name in optional_inputs) - if IsVectorTensorType(ttype): - get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - if is_optional: - get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - dygraph_function_call_list[pos] = f"{name}" +########################## +## Refactored Functions ## +########################## +PARSE_PYTHON_C_TENSORS_TEMPLATE = \ +" auto {} = {}(\"{}\", \"{}\", args, {}, false);\n" + - parse_attributes_str = "" - # Get Attributes - for name, atype, _, pos in forward_attrs_list: - parsing_function = FindParsingFunctionFromAttributeType(atype) - key = f"{name}" +PARSE_PYTHON_C_ARGS_TEMPLATE = \ +""" PyObject* {}_obj = PyTuple_GET_ITEM(args, {});\n + {} {} = {}({}_obj, \"{}\", {});\n""" - parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" - dygraph_function_call_list[pos] = f"{name}" - dygraph_function_call_str = ",".join(dygraph_function_call_list) +RECORD_EVENT_TEMPLATE = \ +" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" - pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);" - PYTHON_C_FUNCTION_TEMPLATE = """ +PYTHON_C_FUNCTION_TEMPLATE = \ +""" static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ {} @@ -131,26 +107,50 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ - namespace_str = "" - if len(namespace) > 0: - namespace_str = f"{namespace}::" - if is_forward_only: - fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name - else: - fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) - python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str, - parse_attributes_str, fwd_function_name, dygraph_function_call_str) +FUNCTION_NAME_TEMPLATE = \ +"{}{}{}" - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" - return python_c_function_str, python_c_function_reg_str +PYTHON_C_FUNCTION_REG_TEMPLATE = \ +"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}" -def GenerateCoreOpsInfoMap(): - result = """ +PYTHON_C_WRAPPER_TEMPLATE = \ +""" +#pragma once + +#include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include 
"paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include + +namespace paddle {{ +namespace pybind {{ + +{} + +static PyMethodDef EagerFinalStateMethods[] = {{ + {} +}}; + +}} // namespace pybind +}} // namespace paddle +""" + + +CORE_OPS_INFO = \ +""" static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) { PyThreadState *tstate = nullptr; try @@ -195,9 +195,11 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { return nullptr; } } - """ +""" + - core_ops_infos_registry = """ +CORE_OPS_INFO_REGISTRY = \ +""" {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, @@ -210,7 +212,259 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"}, """ - return result, core_ops_infos_registry +NAMESPACE_WRAPPER_TEMPLATE = \ +"""namespace {} {{ + {} +}} +""" + + +####################### +## Generator Classes ## +####################### +class PythonCSingleFunctionGenerator: + def __init__(self, fwd_api_contents, namespace): + self.fwd_api_contents = fwd_api_contents + self.namespace = namespace + + # Raw Contents + self.forward_api_name = "" + self.forward_args_str = "" + self.forward_returns_str = "" + + # Raw Data + self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] + self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] + self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] + + # Processed Data + self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] } + self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] } + + # Special Op Attributes + self.optional_inputs = [] #[name, ...] 
+ self.is_forward_only = True + + # Generated Results + self.python_c_function_str = "" + self.python_c_function_reg_str = "" + + def CollectRawContents(self): + fwd_api_contents = self.fwd_api_contents + + assert 'api' in fwd_api_contents.keys( + ), "Unable to find \"api\" in fwd_api_contents keys" + assert 'args' in fwd_api_contents.keys( + ), "Unable to find \"args\" in fwd_api_contents keys" + assert 'output' in fwd_api_contents.keys( + ), "Unable to find \"output\" in fwd_api_contents keys" + + self.forward_api_name = fwd_api_contents['api'] + self.forward_args_str = fwd_api_contents['args'] + self.forward_returns_str = fwd_api_contents['output'] + + def CollectIsForwardOnly(self): + fwd_api_contents = self.fwd_api_contents + self.is_forward_only = False if 'backward' in fwd_api_contents.keys( + ) else True + + def CollectOptionalInputs(self): + fwd_api_contents = self.fwd_api_contents + if 'optional' in fwd_api_contents.keys(): + self.optional_inputs = ParseDispensable(fwd_api_contents[ + 'optional']) + + def CollectForwardInOutAttr(self): + forward_args_str = self.forward_args_str + forward_returns_str = self.forward_returns_str + + self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward( + forward_args_str, forward_returns_str) + + def CollectForwardPositionMap(self): + forward_inputs_list = self.forward_inputs_list + forward_returns_list = self.forward_returns_list + + self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + + def GeneratePythonCFunction(self): + namespace = self.namespace + forward_api_name = self.forward_api_name + forward_attrs_list = self.forward_attrs_list + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + optional_inputs = self.optional_inputs + is_forward_only = self.is_forward_only + + # Generate Python-C Tensors Parsing Logic + get_eager_tensor_str = "" + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorListFromArgs", forward_api_name, name, pos) + else: + if is_optional: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetOptionalTensorFromArgs", forward_api_name, + name, pos) + else: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorFromArgs", forward_api_name, name, pos) + + parse_attributes_str = "" + + # Generate Python-C Attributes Parsing Logic + for name, atype, _, pos in forward_attrs_list: + parsing_function_name = FindParsingFunctionFromAttributeType(atype) + parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( + name, pos, atype, name, parsing_function_name, name, + forward_api_name, pos) + + # Generate Dygraph Function Call Logic + num_args = len(forward_inputs_position_map.keys()) + len( + forward_attrs_list) + dygraph_function_call_list = ["" for i in range(num_args)] + for name, (_, pos) in forward_inputs_position_map.items(): + dygraph_function_call_list[pos] = f"{name}" + for name, _, _, pos in forward_attrs_list: + dygraph_function_call_list[pos] = f"{name}" + dygraph_function_call_str = ",".join(dygraph_function_call_list) + + # Generate Python-C Function Definitions + if is_forward_only: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "paddle::experimental::", namespace, 
forward_api_name)
+        else:
+            fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
+                "", namespace, GetForwardFunctionName(forward_api_name))
+
+        # Generate Record Event for performance profiling
+        pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format(
+            "pythonc_record_event", forward_api_name, "pybind_imperative_func")
+        self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
+            forward_api_name, pythonc_record_event_str, forward_api_name,
+            get_eager_tensor_str, parse_attributes_str, fwd_function_name,
+            dygraph_function_call_str)
+
+        # Generate Python-C Function Registration
+        self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
+            forward_api_name, namespace, forward_api_name, forward_api_name)
+
+    def run(self):
+        # Initialize is_forward_only
+        self.CollectIsForwardOnly()
+
+        # Initialize forward_api_name, forward_args_str, forward_returns_str
+        self.CollectRawContents()
+        if SkipAPIGeneration(self.forward_api_name): return False
+
+        # Initialize optional_inputs
+        self.CollectOptionalInputs()
+
+        # Initialize forward_inputs_list, forward_returns_list, forward_attrs_list
+        self.CollectForwardInOutAttr()
+        logging.info(
+            f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}")
+        logging.info(
+            f"Parsed Original Forward Attrs List: \n{self.forward_attrs_list}")
+        logging.info(
+            f"Parsed Original Forward Returns List: \n{self.forward_returns_list}"
+        )
+
+        # Initialize forward_inputs_position_map, forward_outputs_position_map
+        self.CollectForwardPositionMap()
+        logging.info(
+            f"Generated Forward Input Position Map: {self.forward_inputs_position_map}"
+        )
+        logging.info(
+            f"Generated Forward Output Position Map: {self.forward_outputs_position_map}"
+        )
+
+        # Code Generation
+        self.GeneratePythonCFunction()
+        logging.info(
+            f"Generated Python-C Function: {self.python_c_function_str}")
+        logging.info(
+            f"Generated Python-C Function Registration: {self.python_c_function_reg_str}"
+        )
+
+        return True
+
+
+class PythonCYamlGenerator:
+    def __init__(self, path):
+        self.yaml_path = path
+
+        self.namespace = ""
+        self.forward_api_list = []
+
+        # Generated Result
+        self.python_c_functions_reg_str = ""
+        self.python_c_functions_str = ""
+
+    def ParseYamlContents(self):
+        yaml_path = self.yaml_path
+        self.forward_api_list = ReadFwdFile(yaml_path)
+
+    def GeneratePythonCFunctions(self):
+        namespace = self.namespace
+        forward_api_list = self.forward_api_list
+
+        for forward_api_content in forward_api_list:
+            f_generator = PythonCSingleFunctionGenerator(forward_api_content,
+                                                         namespace)
+            status = f_generator.run()
+
+            if status:
+                self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
+                self.python_c_functions_str += f_generator.python_c_function_str + "\n"
+
+    def InferNameSpace(self):
+        yaml_path = self.yaml_path
+        if "sparse" in yaml_path:
+            self.namespace = "sparse::"
+
+    def AttachNamespace(self):
+        namespace = self.namespace
+        python_c_functions_str = self.python_c_functions_str
+
+        if namespace != "":
+            if namespace.endswith("::"):
+                namespace = namespace[:-2]
+            self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format(
+                namespace, python_c_functions_str)
+
+    def run(self):
+        # Infer namespace from yaml_path
+        self.InferNameSpace()
+
+        # Read Yaml file
+        self.ParseYamlContents()
+
+        # Code Generation
+        self.GeneratePythonCFunctions()
+
+        # Wrap with namespace
+        self.AttachNamespace()
+
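For orientation, the sketch below shows roughly what one generated binding could look like once the templates above are expanded, for a hypothetical `matmul(x, y, trans_x)` yaml entry. The full body of PYTHON_C_FUNCTION_TEMPLATE is elided from this diff, so the error-handling shape and the dygraph function name here are assumptions, not the generated code verbatim.

// Hypothetical expansion for a "matmul" entry (illustrative only):
static PyObject * eager_final_state_api_matmul(PyObject *self, PyObject *args,
                                               PyObject *kwargs) {
  // RECORD_EVENT_TEMPLATE expansion for profiling
  paddle::platform::RecordEvent pythonc_record_event(
      "matmul pybind_imperative_func",
      paddle::platform::TracerEventType::Operator, 1);
  try {
    // PARSE_PYTHON_C_TENSORS_TEMPLATE expansions; positions follow the yaml args
    auto x = GetTensorFromArgs("matmul", "x", args, 0, false);
    auto y = GetTensorFromArgs("matmul", "y", args, 1, false);
    // PARSE_PYTHON_C_ARGS_TEMPLATE expansion for one bool attribute
    PyObject* trans_x_obj = PyTuple_GET_ITEM(args, 2);
    bool trans_x = CastPyArg2Boolean(trans_x_obj, "matmul", 2);
    // Call into the dygraph forward function (name via GetForwardFunctionName)
    auto out = matmul_final_state_dygraph_function(x, y, trans_x);
    return ToPyObject(out);
  } catch (...) {
    ThrowExceptionToPython(std::current_exception());
    return nullptr;
  }
}

+############################
+## Code Generation Helper ##
+############################
+def ParseArguments():
+    parser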
= argparse.ArgumentParser( + description='Eager Code Generator Args Parser') + parser.add_argument('--api_yaml_path', type=str) + parser.add_argument('--output_path', type=str) + + args = parser.parse_args() + return args + + +def GenerateCoreOpsInfoMap(): + return CORE_OPS_INFO, CORE_OPS_INFO_REGISTRY def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): @@ -222,36 +476,6 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): python_c_function_reg_str += core_ops_infos_registry python_c_function_reg_str += "\n {nullptr,nullptr,0,nullptr}" - PYTHON_C_WRAPPER_TEMPLATE = """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/dygraph_api.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/phi/api/include/sparse_api.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include - -namespace paddle {{ -namespace pybind {{ - -{} - -static PyMethodDef EagerFinalStateMethods[] = {{ - {} -}}; - -}} // namespace pybind -}} // namespace paddle - -""" python_c_str = PYTHON_C_WRAPPER_TEMPLATE.format(python_c_function_str, python_c_function_reg_str) @@ -265,86 +489,23 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_paths = args.api_yaml_path.split(",") - python_c_functions_reg_str = "" - python_c_functions_str = "" - + generated_python_c_functions = "" + generated_python_c_registration = "" for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] - if "sparse" in api_yaml_path: - namespace = "sparse" - else: - namespace = "" - - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", - forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, 
is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - # Append Namespace - python_c_functions_reg_str += ",\n".join( - python_c_function_reg_list) + "," - python_c_functions = "\n".join(python_c_function_list) - if len(namespace) > 0: - python_c_functions_str += f"""namespace {namespace} {{ - {python_c_functions} -}} -""" + y_generator = PythonCYamlGenerator(api_yaml_path) + y_generator.run() - else: - python_c_functions_str += python_c_functions + generated_python_c_functions += y_generator.python_c_functions_str + "\n" + generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n" - python_c_str = GeneratePythonCWrappers(python_c_functions_str, - python_c_functions_reg_str) + python_c_str = GeneratePythonCWrappers(generated_python_c_functions, + generated_python_c_registration) - print("Generated Python-C Codes: ", python_c_str) + logging.info(f"Generated Python-C Codes: \n{python_c_str}") output_path = args.output_path for path in [output_path]: diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 75ddfb92275524eece120e6f2aae4f41a3e67701..17bc2441488aa3c4fc62a37e825eeb94cafea9bb 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -517,11 +517,11 @@ std::vector RunBackward( } // TODO(jiabin): Should we erase it or find a more efficient way. + node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " @@ -532,6 +532,7 @@ std::vector RunBackward( for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { const Edge& edge = edges[i][j]; + auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them // with @@ -545,6 +546,7 @@ std::vector RunBackward( grad_output_tensors[i].empty()) { continue; } + PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 7eb2902d935c4fd8d5990c81fbf6bcf3fd6e6e66..891ad4d8983b5b37b31ab5f5f980e74ccff47069 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -15,10 +15,16 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -33,7 +39,6 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - // adj_edges has the same num as backward outputs adj_edges_.resize(bwd_out_slot_num); } @@ -44,24 +49,20 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { "Given slot id is out of range of adj_edges 
outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - for (const auto& meta : *metas) { + + for (size_t i = 0; i < metas->size(); i++) { + const auto& meta = (*metas)[i]; // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } } @@ -73,130 +74,205 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); + if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } -const std::vector& GradNodeBase::InputMeta() const { +const std::vector>& GradNodeBase::InputMeta() const { return bwd_in_meta_; } -const std::vector& GradNodeBase::OutputMeta() const { +const std::vector>& GradNodeBase::OutputMeta() const { return bwd_out_meta_; } -void GradNodeBase::SetGradInMeta(std::vector* fwd_out, +void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { - size_t slot_size = fwd_out->size(); + auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, addition " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); - // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - PADDLE_ENFORCE_NOT_NULL((*fwd_out)[i], - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be called while " - "autograd_meta is not null. 
If you got this " - "error, it indicates bugs in framework.")); - if ((*fwd_out)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_out)[i]->StopGradient()); + auto& metas = bwd_in_meta_.at(slot_rank); + if (metas.size() == 0) { + metas.resize(1); + } + + auto& meta = metas[0]; + meta.SetStopGradient(fwd_out_meta->StopGradient()); + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank) { +void GradNodeBase::SetGradInMeta( + const std::vector& fwd_out, + size_t slot_rank) { + size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_in_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; - meta.Init(1); - meta.SetStopGradient(0, fwd_out->StopGradient()); + if (metas.size() < slot_size) { + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + auto& meta = metas[i]; + const auto& fwd_out_tensor = fwd_out[i]; + auto* fwd_out_meta = + egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor); + PADDLE_ENFORCE_NOT_NULL(fwd_out_meta, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out_meta->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. 
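+      // GradSlotMeta now keeps a single stop_gradient_ flag per slot entry,
+      // defaulting to false, so a false value never needs to be written back.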
+ meta.SetStopGradient(fwd_out_meta->StopGradient()); + } + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out_tensor.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } + } } -void GradNodeBase::SetGradOutMeta(std::vector* fwd_in, +void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, size_t slot_rank) { - size_t slot_size = fwd_in->size(); + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in); PADDLE_ENFORCE_LE( - slot_rank, (bwd_out_meta_.size() - 1), + (slot_rank + 1), bwd_out_meta_.size(), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - if (!(*fwd_in)[i]) { - meta.SetStopGradient(i, true); - continue; - } - if ((*fwd_in)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_in)[i]->StopGradient()); + if (metas.size() == 0) { + metas.resize(1); + } + auto& meta = metas[0]; + if (fwd_in_meta) { + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } else { + meta.SetStopGradient(true); + } + + // Record TensorMeta + if (fwd_in.impl() && fwd_in.impl().get()) { + if (phi::DenseTensor::classof(fwd_in.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in.impl().get()); + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank) { +void GradNodeBase::SetGradOutMeta( + const std::vector& fwd_in, size_t slot_rank) { + size_t slot_size = fwd_in.size(); PADDLE_ENFORCE_LE( - (slot_rank + 1), bwd_out_meta_.size(), + slot_rank, (bwd_out_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. 
Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(1); - if (fwd_in) { - meta.SetStopGradient(0, fwd_in->StopGradient()); - } else { - meta.SetStopGradient(0, true); + if (metas.size() < slot_size) { + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + const auto& fwd_in_tensor = fwd_in[i]; + auto& meta = metas[i]; + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + if (fwd_in_meta) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } + + // Record TensorMeta + if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { + if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in_tensor.impl().get()); + + PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with " + "phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } } } @@ -207,12 +283,8 @@ void GradNodeBase::SetDefaultGradInOutMeta() { "meta setter, other size of inputs and outputs should " "create with Setter and Getters")); // Default stop_gradient is false and slot id is 0, slot size is 1; - bwd_out_meta_[0].Init(1); - bwd_in_meta_[0].Init(1); -} - -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; + bwd_out_meta_[0].resize(1); + bwd_in_meta_[0].resize(1); } int64_t GradNodeBase::RegisterGradientHook( @@ -222,6 +294,10 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } +const std::vector>& GradNodeBase::GetEdges() const { + return adj_edges_; +} + std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { @@ -270,4 +346,45 @@ GradNodeBase::ApplyGradientHooks( return outs; } +void GradNodeBase::HandleComplexGradToRealGrad( + std::vector>* out_grads) { + for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { + const std::vector& slot_out_grads = + (*out_grads)[slot_id]; + for (size_t rank_id = 0; rank_id < slot_out_grads.size(); rank_id++) { + const GradSlotMeta& slot_meta = bwd_out_meta_[slot_id][rank_id]; + + PADDLE_ENFORCE( + slot_meta.HasTensorMeta() > 0, + paddle::platform::errors::Fatal( + "We require TensorMeta in GradInputMeta() to obtain forward data " + "types." 
+ "However, no TensorMeta is detected in bwd_out_meta_.")); + + auto fwd_data_type = paddle::framework::TransToProtoVarType( + slot_meta.GetTensorMeta().dtype); + const paddle::experimental::Tensor& grad = slot_out_grads[rank_id]; + + if (paddle::framework::IsComplexType(fwd_data_type)) continue; + + // Only Handle Complex To Real for DenseTensor for now + if (phi::DenseTensor::classof(grad.impl().get())) { + phi::DenseTensor* grad_dense_tensor = + static_cast(grad.impl().get()); + + auto curr_data_type = + paddle::framework::TransToProtoVarType(grad_dense_tensor->type()); + if (!paddle::framework::IsComplexType(curr_data_type)) continue; + + // Convert Complex GradOut to Real + auto out = std::make_shared(); + paddle::framework::TransComplexToReal(fwd_data_type, curr_data_type, + *grad_dense_tensor, out.get()); + + (*out_grads)[slot_id][rank_id].set_impl(out); + } + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 168e1bcca77ca85eb6fa90a23350d1f62f63dc8e..4b21a193ee021f06538e1a11bbffb898376739a7 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -57,21 +57,28 @@ class AutogradMeta; class GradSlotMeta { public: GradSlotMeta() = default; - void Init(size_t size) { - size_ = static_cast(size); - stop_gradient_.resize(size, false); + bool IsStopGradient() const { return stop_gradient_; } + void SetStopGradient(bool stop_gradient = true) { + stop_gradient_ = stop_gradient; } - bool IsInitialized() const { return size_ != -1; } - bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; } - int Size() const { return size_; } - void SetStopGradient(size_t rank, bool stop_gradient = true) { - stop_gradient_.at(rank) = stop_gradient; + void SetTensorMeta(const phi::DenseTensorMeta& meta) { + meta_ = std::make_shared(meta); + } + bool HasTensorMeta() const { return meta_ && meta_.get(); } + const phi::DenseTensorMeta& GetTensorMeta() const { + if (!HasTensorMeta()) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "meta_ of GradSlotMeta has not been initialized yet." + "You're expected to check Edge availability with HasTensorMeta()" + "before calling GetTensorMeta() interface.")); + } + return *meta_.get(); } private: - int size_{-1}; - std::vector stop_gradient_{false}; + bool stop_gradient_{false}; + std::shared_ptr meta_ = nullptr; }; class GradNodeBase { @@ -112,25 +119,30 @@ class GradNodeBase { void AddEdges(std::vector* metas, size_t slot_id); void AddEdges(AutogradMeta* meta, size_t slot_id); - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; + // adj_edges were moved inside OutputMeta(), so no available direct access + // from GradNodeBase. 
+ // To access Edges, get GradSlotMeta by calling OutputMeta(), then use + // slot_meta.GetEdge() /** * Get Input Meta of current Grad node**/ - const std::vector& InputMeta() const; + const std::vector>& InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector& OutputMeta() const; + const std::vector>& OutputMeta() const; /** * Set bwd ins and outs info with forward vars * **/ - void SetGradInMeta(std::vector* fwd_out, size_t slot_rank); - void SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank); + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); + void SetGradInMeta(const paddle::experimental::Tensor& fwd_out, + size_t slot_rank); - void SetGradOutMeta(std::vector* fwd_in, size_t slot_rank); - void SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank); + void SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank); + void SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, + size_t slot_rank); /** * Default setters for Grad in/out meta this should be used for same special @@ -162,11 +174,21 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + /** + * Handle Complex - Real Type Promotion + * **/ + void HandleComplexGradToRealGrad( + std::vector>* out_grads); + bool NeedComplexToRealConversion() { return need_complex_to_real_; } + virtual std::string name() { return "GradNodeBase"; } - private: - // TODO(jiabin): Use SmallVector instead after merge PR from develop + /** + * GetEdges is designed to get all edges of current node**/ + const std::vector>& GetEdges() const; + private: + // TODO(zhanlve): Merge adj_edges_ into GradOutMeta // Edges recorded the backward related node info, which indicate all edges // linked // by this Grad Node. @@ -174,10 +196,10 @@ class GradNodeBase { std::vector> adj_edges_; // bwd_out_meta_ is used to record Grad output info for backward - std::vector bwd_out_meta_; + std::vector> bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector bwd_in_meta_; + std::vector> bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -188,6 +210,8 @@ class GradNodeBase { /* hook */ std::shared_ptr>> gradient_hooks_; + // We handle complex to real conversion only if any complex GradIn is involved + bool need_complex_to_real_ = false; int64_t next_hook_id_{0}; }; diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index 9059b403607461cc980a58d345fe1542aa4b1903..8c00f9161b629f7a3f093a1225d3d5b0b9bcca8b 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -26,12 +26,13 @@ namespace egr { * GradTensorHolder should have as same format as forward output **/ class GradTensorHolder { public: - explicit GradTensorHolder(const std::vector& meta) { - VLOG(7) << "Init GradTensorHolder with meta size: " << meta.size(); - buffer_.resize(meta.size()); + explicit GradTensorHolder( + const std::vector>& metas) { + VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); + buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { - VLOG(7) << "Init GradTensorHolder with meta rank: " << meta[i].Size(); - buffer_[i].resize(meta[i].Size()); + VLOG(7) << "Init GradTensorHolder with meta rank: " << metas[i].size(); + buffer_[i].resize(metas[i].size()); } } diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 
0e11444b81526de1904b72fc983814314d834a45..8da27f3bb8a13a759bd12737746ce6add4b1aaa5 100644
--- a/paddle/fluid/eager/tensor_wrapper.h
+++ b/paddle/fluid/eager/tensor_wrapper.h
@@ -36,6 +36,15 @@ class TensorWrapper {
   explicit TensorWrapper(const paddle::experimental::Tensor& tensor,
                          bool full_reserved = false,
                          bool no_need_buffer = false) {
+    // set inplace_version_snapshot_ according to tensor's current inplace
+    // version.
+    if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) {
+      phi::DenseTensor* dense_tensor =
+          static_cast<phi::DenseTensor*>(tensor.impl().get());
+      auto& inplace_version_counter = dense_tensor->InplaceVersionCounter();
+      inplace_version_snapshot_ = inplace_version_counter.CurrentVersion();
+    }
+
     /**
      * Normally, we should fully reserved all non-output or non-leaf fwd tensor
      * here. And for fwd output tensor, we should not reserve its autogradmeta,
@@ -49,6 +58,7 @@ class TensorWrapper {
     }
 
     // shallow copy tensor_impl here
+    no_need_buffer_ = no_need_buffer;
     if (no_need_buffer) {
       if (phi::DenseTensor::classof(tensor.impl().get())) {
         // Only Copy Meta
@@ -86,6 +96,7 @@ class TensorWrapper {
     // if it's full_reserved just return the full copy of tensor
     if (full_reserved_) {
+      check_inplace_version();
       return intermidiate_tensor_;
     } else {
       std::shared_ptr<GradNodeBase> new_grad_node = grad_node;
@@ -94,15 +105,52 @@ class TensorWrapper {
       intermidiate_tensor_.set_autograd_meta(
           std::static_pointer_cast<AutogradMeta>(
               p_ab_autograd_meta));
+      check_inplace_version();
       return intermidiate_tensor_;
     }
   }
 
+  void check_inplace_version() {
+    if (no_need_buffer_) {
+      VLOG(6) << "There's no need to check inplace_version because "
+                 "no_need_buffer_ is true.";
+      return;
+    }
+    if (intermidiate_tensor_.impl() &&
+        phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) {
+      phi::DenseTensor* dense_tensor =
+          static_cast<phi::DenseTensor*>(intermidiate_tensor_.impl().get());
+      auto& inplace_version_counter = dense_tensor->InplaceVersionCounter();
+
+      uint32_t current_inplace_version =
+          inplace_version_counter.CurrentVersion();
+      PADDLE_ENFORCE_EQ(
+          current_inplace_version, inplace_version_snapshot_,
+          paddle::platform::errors::PermissionDenied(
+              "Tensor '%s' used in gradient computation has been "
+              "modified by an inplace operation. "
+              "Its version is %d but the expected version is %d. "
+              "Please fix your code to avoid calling an inplace operator "
+              "after using the Tensor which will be used in gradient "
+              "computation.",
+              intermidiate_tensor_.name(), current_inplace_version,
+              inplace_version_snapshot_));
+      VLOG(6) << " The inplace_version_snapshot_ of Tensor '"
+              << intermidiate_tensor_.name() << "' is [ "
+              << inplace_version_snapshot_ << " ]";
+      VLOG(6) << " The current_inplace_version of Tensor '"
+              << intermidiate_tensor_.name() << "' is [ "
+              << current_inplace_version << " ]";
+    }
+  }
+
   void clear() { intermidiate_tensor_.reset(); }
 
  private:
   bool full_reserved_ = false;
+  bool no_need_buffer_ = false;
   std::pair<size_t, size_t> out_rank_info_;
   paddle::experimental::Tensor intermidiate_tensor_;
+  uint32_t inplace_version_snapshot_ = 0;
 };
 }  // namespace egr
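Read together, the snapshot in the constructor and check_inplace_version() implement a simple protocol; the sketch below restates it as hypothetical driver code (MakeDenseTensor and the recover() accessor name are assumptions for illustration, not part of this patch).

// Hypothetical driver code for the inplace-version protocol:
paddle::experimental::Tensor t = MakeDenseTensor();     // assumed helper
egr::TensorWrapper wrapper(t, /*full_reserved=*/true);  // snapshot = CurrentVersion()
// Any inplace op on t would now bump its DenseTensor inplace version.
auto recovered = wrapper.recover(nullptr);  // re-checks the counter and raises
                                            // PermissionDenied on a mismatch

diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
index e3db309c4016a512c5379fb352beb4af690a271e..d592b5ccf66ffc8532214a72612e9308b7e51fe5 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.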
// See the License for the specific language governing permissions and // limitations under the License. + #include "glog/logging.h" #include "gtest/gtest.h" @@ -23,14 +24,9 @@ TEST(GradNodeInfo, GradSlotMeta) { auto grad_slot = egr::GradSlotMeta(); - CHECK(grad_slot.IsInitialized() == false); - VLOG(6) << "Init GradSlotMeta"; - grad_slot.Init(2); - CHECK(grad_slot.IsInitialized() == true); VLOG(6) << "Set SetStopGradient"; - grad_slot.SetStopGradient(0); - CHECK(grad_slot.IsStopGradient(0) == true); - CHECK_EQ(grad_slot.Size(), 2); + grad_slot.SetStopGradient(); + CHECK(grad_slot.IsStopGradient() == true); } void TestGradNodeBase(bool is_remove_gradient_hook) { @@ -56,18 +52,22 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { ->data()[0], 6.0f); VLOG(6) << "Test Add Edges"; - egr::Edge edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(edge0); + egr::Edge tmp_edge0(grad_test_node1, 1, 2); + auto auto_grad0 = std::make_shared(tmp_edge0); auto_grad0->SetStopGradient(false); - egr::Edge edge1(grad_test_node1, 3, 4); - auto auto_grad1 = std::make_shared(edge1); + + egr::Edge tmp_edge1(grad_test_node1, 3, 4); + auto auto_grad1 = std::make_shared(tmp_edge1); + et1.set_autograd_meta(auto_grad1); auto_grad1->SetStopGradient(false); grad_test_node0->AddEdges(auto_grad0.get(), 0); + CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, size_t(2)); std::vector metas = {auto_grad1.get()}; + grad_test_node0->AddEdges(&metas, 1); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, size_t(3)); @@ -76,22 +76,30 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); - grad_test_node0->SetGradInMeta(&metas, 0); - grad_test_node0->SetGradInMeta(auto_grad1.get(), 1); - grad_test_node0->SetGradOutMeta(&metas, 0); - grad_test_node0->SetGradOutMeta(auto_grad1.get(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1); - CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0)); - CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0)); + grad_test_node0->SetGradInMeta(et1, 0); + grad_test_node0->SetGradInMeta({et1}, 1); + grad_test_node0->SetGradOutMeta(et1, 0); + grad_test_node0->SetGradOutMeta({et1}, 1); + CHECK_EQ(grad_test_node0->InputMeta()[0].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[1].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->InputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); + CHECK(grad_test_node0->OutputMeta()[0][0].IsStopGradient()); + CHECK(grad_test_node0->OutputMeta()[1][0].IsStopGradient()); + CHECK_EQ(grad_test_node0->OutputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->OutputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); VLOG(6) << "Test Default Set Meta and Get Meta"; auto grad_test_node2 = std::make_shared( /* val */ 5.0, /* in_num */ 1, /* out_num */ 1); grad_test_node2->SetDefaultGradInOutMeta(); - CHECK(grad_test_node2->OutputMeta()[0].IsInitialized()); - CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false); - CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1); + CHECK_GT(grad_test_node2->OutputMeta()[0].size(), size_t(0)); + CHECK(grad_test_node2->OutputMeta()[0][0].IsStopGradient() == false); + CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), 
size_t(1)); VLOG(6) << "Test Gradient Hook"; auto gradient_hook = []( @@ -135,7 +143,17 @@ TEST(GradNodeInfo, GradNodeBase) { } TEST(GradNodeInfo, Edge) { + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor et1(dt); + auto grad_test_node0 = std::make_shared(5, 2, 2); + auto auto_grad1 = std::make_shared(); VLOG(6) << "Test Construct Edge"; egr::Edge edge0 = egr::Edge(); CHECK(edge0.IsInitialized() == false); @@ -145,13 +163,12 @@ TEST(GradNodeInfo, Edge) { egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0))); VLOG(6) << "Test Set Edge's Grad Node"; auto* grad_node = edge1.GetGradNode(); + et1.set_autograd_meta(auto_grad1); + grad_node->SetGradInMeta(et1, 0); + CHECK_EQ(grad_node->InputMeta().size(), size_t(2)); - auto mt_grad_node = edge1.GetMutableGradNode(); - auto auto_grad1 = std::make_shared(); std::vector metas = {auto_grad1.get()}; - // Uninitialized AutogradMeta indicates - mt_grad_node->SetGradInMeta(&metas, 0); - CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true); + CHECK(grad_node->InputMeta()[0][0].IsStopGradient() == true); VLOG(6) << "Test Get/Set Edge Rank Info"; CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0)); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 384fdcd6f97c4b318341db68cdd88b644d42d22a..645eac06ddda519bba952abb460571c9667c6d4a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -30,8 +30,7 @@ PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); using namespace egr; // NOLINT TEST(GradTensorHolder, Constructor) { - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta}); GradTensorHolder grad_tensor_holder2 = GradTensorHolder(grad_tensor_holder); @@ -72,8 +71,7 @@ TEST(GradTensorHolder, Interfaces) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); @@ -138,8 +136,7 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { paddle::experimental::Tensor t2(sr2); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 887ea3e3acfd50a15206f3e84ab45e16707f80af..c8fb6050e9d450d598ea722ac74da924e8857f0e 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -37,7 +37,7 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -static size_t max_num_benchmark_runs = 5000; +static size_t max_num_benchmark_runs = 4000; namespace egr { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 
9967d8c36900f45fdd76272bc4416df1d30f2a6a..277319bc700b652855576db248463b424846e2e9 100644
--- a/paddle/fluid/eager/to_static/run_program_op_func.h
+++ b/paddle/fluid/eager/to_static/run_program_op_func.h
@@ -66,10 +66,10 @@ inline void run_program_dygraph_function(
   grad_node->SetStepScope(step_scope);
 
   // Set Grad out rank as same as fwd input and set stop gradient to bwd
-  grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0);
-  grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1);
+  grad_node->SetGradOutMeta(x, /*slot id*/ 0);
+  grad_node->SetGradOutMeta(params, /*slot id*/ 1);
 
-  grad_node->SetGradInMeta(&p_autograd_outs, 0);
+  grad_node->SetGradInMeta(deref_out, 0);
   // Set Next Edges
   grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0);
   grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1);
diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc
index 8a57d2694535e9c27e88416468fe5a67ce020b43..048087903a47c1699a7d7f32199c313146bd37ab 100644
--- a/paddle/fluid/eager/utils.cc
+++ b/paddle/fluid/eager/utils.cc
@@ -212,6 +212,27 @@ std::vector<std::shared_ptr<egr::EagerVariable>> EagerUtils::CreateVars(
   return res;
 }
 
+void EagerUtils::ModifyInplaceInput(
+    const std::shared_ptr<egr::EagerVariable>& inplace_variable,
+    paddle::experimental::Tensor* inplace_tensor) {
+  // Only modify the meta information of the inplace tensor, because
+  // EagerVariable cannot modify Tensor's meta information after inplace
+  // op (such as ``reshape``) is executed.
+  PADDLE_ENFORCE_NOT_NULL(inplace_tensor,
+                          paddle::platform::errors::Fatal(
+                              "Inplace Tensor is null and cannot be modified. "
+                              "We are trying to Modify Inplace Input from its "
+                              "shared_ptr; this error may indicate the inplace "
+                              "input is nullptr"));
+  if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) {
+    phi::DenseTensor* variable_dense_tensor =
+        static_cast<phi::DenseTensor*>(inplace_variable->GetTensorBase().get());
+    phi::DenseTensor* tensor_dense_tensor =
+        static_cast<phi::DenseTensor*>(inplace_tensor->impl().get());
+    tensor_dense_tensor->set_meta(variable_dense_tensor->meta());
+  }
+}
+
 std::vector<paddle::experimental::Tensor> EagerUtils::GetOutputs(
     const std::vector<std::shared_ptr<egr::EagerVariable>>& outs) {
   std::vector<paddle::experimental::Tensor> res;
diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h
index fa5735e6f32a0ca7762b9ba94cce26ac8ac567dd..fbd080ef70e25408abcb979360610ad08d752f96 100644
--- a/paddle/fluid/eager/utils.h
+++ b/paddle/fluid/eager/utils.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
@@ -144,6 +145,19 @@ class EagerUtils {
     iter.apply(std::forward<Args>(args)...);
   }
 
+  static void CheckInplace(const paddle::experimental::Tensor& target,
+                           const AutogradMeta* autograd_meta,
+                           bool require_any_grad) {
+    if (require_any_grad && autograd_meta) {
+      PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() &&
+                            egr::egr_utils_api::IsLeafTensor(target),
+                        false, paddle::platform::errors::InvalidArgument(
+                                   "Leaf Var (%s) that doesn't stop gradient "
+                                   "can't use inplace strategy.",
+                                   target.name()));
+    }
+  }
+
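To show where this guard sits, here is a minimal hypothetical call site; relu_ and its surrounding control flow are assumptions for illustration, not part of this patch.

// Hypothetical inplace entry point consulting the new guard:
paddle::experimental::Tensor& relu_(paddle::experimental::Tensor& x) {
  egr::AutogradMeta* autograd_meta = egr::EagerUtils::nullable_autograd_meta(x);
  bool require_any_grad = autograd_meta && !autograd_meta->StopGradient();
  // Throws InvalidArgument when x is a gradient-requiring leaf tensor.
  egr::EagerUtils::CheckInplace(x, autograd_meta, require_any_grad);
  // ... run the kernel on x's buffer and bump its inplace version ...
  return x;
}

   // TensorWrapper Utils
   static paddle::experimental::Tensor RecoverTensorWrapper(
       TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node);
@@ -171,6 +185,9 @@ class EagerUtils {
   static std::vector<std::shared_ptr<egr::EagerVariable>> CreateVars(
       const size_t num);
   // Construct Tensor From var
+  static void ModifyInplaceInput(
+      const std::shared_ptr<egr::EagerVariable>& inplace_variable,
+      paddle::experimental::Tensor* inplace_tensor);
   static std::vector<paddle::experimental::Tensor> GetOutputs(
       const std::vector<std::shared_ptr<egr::EagerVariable>>& outs);
   static paddle::experimental::Tensor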
GetOutput( diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 11190309814e7c75777a6cddd7e4d24bfc7ba9e6..bf2cf58f970addf1dac9f4871ba4abe09c3c7b38 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -32,8 +32,9 @@ USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index ef2e83ced26e07f199a122ee3157eb428b63aec9..7df957b2c0eca64bacd1b48065f37ddffec1770a 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -18,6 +18,7 @@ #include #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,10 +28,11 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 28e1145db42123b9dacfa9e359e08476d16ab4c0..7fe1852f7396cb8cebe4b83f4cc80a8023421351 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -38,7 +38,7 @@ USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum_grad); -USE_OP(reduce_mean_grad); +USE_OP_ITSELF(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ec28c98d5986d96109332db488fd48fc20834bfb..42fbeb5d29ce4ac3a1498704b1fff88570c9c092 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -628,10 +628,12 @@ std::vector ExecutionContext::MultiOutput( bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first + bool has_phi_kernel = false; auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { + has_phi_kernel = true; if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } @@ -639,12 +641,19 @@ bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); - if (it == all_kernels.end()) { - // All control operator must support GPU - return true; - } - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_)) { + if (it != all_kernels.end()) { + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + } else { + if 
(has_phi_kernel) {
+      // The op has a phi kernel, but neither a phi GPU kernel nor a fluid
+      // GPU kernel was found, so this op doesn't support GPU.
+      return false;
+    } else {
+      // All control operator must support GPU
       return true;
     }
   }
@@ -2347,6 +2356,10 @@ void OperatorWithKernel::BuildPhiKernelContext(
         const auto& vector_int_attr =
             BOOST_GET_CONST(std::vector<int>, attr_it->second);
         pt_kernel_context->EmplaceBackAttr(vector_int_attr);
+      } else if (attr_defs[i].type_index ==
+                 std::type_index(typeid(std::vector))) {
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(std::vector, attr_it->second));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported cast op attribute `%s` when construct "
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index f70f44878e361bf72c35ae5ae346c47869198eb5..9daac181d57de63a85116d176a286a9be9b3d4c7 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -541,6 +541,10 @@ void BuildDygraphPhiKernelContext(
     } else if (attr_defs[i].type_index ==
                std::type_index(typeid(std::vector))) {
       kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr));
+    } else if (attr_defs[i].type_index ==
+               std::type_index(typeid(std::vector))) {
+      kernel_ctx->EmplaceBackAttr(
+          BOOST_GET_CONST(std::vector, attr));
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "Unsupported cast op attribute `%s` when construct "
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 79a31555c7f0b1cb4a8d9c48bae16145d605935b..2c0945cd5b386a003ce63c86f3feb52213b378ba 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -53,7 +53,11 @@ if [ $7 == ON ]; then
   if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
     echo "MobileNetV2.inference.model.tar.gz has been downloaded."
   else
-    wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    if [ "$WIN_DETECT" != "" ]; then
+      wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    else
+      wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    fi
     tar xzf *.tar.gz
   fi
   cd ..
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 61e292a922f0e98a958d4fe2f8fc7850bdf47e18..4a44448dc84cf744cdf061031bdf7fae8f658c4b 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -219,6 +219,12 @@ class AllocatorFacadePrivate {
     }
     InitNaiveBestFitCUDAPinnedAllocator();
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+    for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
+      InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
+    }
+    InitNaiveBestFitNPUPinnedAllocator();
+#endif
 #ifdef PADDLE_WITH_XPU
     for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
       InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
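As a reading aid, the OpSupportGPU change a few hunks above boils down to the following decision sketch; this is an illustrative restatement under assumed flag names, not code from the patch.

// Illustrative restatement of the new OpSupportGPU decision order:
bool OpSupportGPUSketch(bool has_phi_gpu_kernel, bool has_phi_kernel,
                        bool has_fluid_entry, bool has_fluid_gpu_kernel) {
  if (has_phi_gpu_kernel) return true;               // phi registry has a GPU kernel
  if (has_fluid_entry) return has_fluid_gpu_kernel;  // fall back to fluid kernels
  if (has_phi_kernel) return false;                  // phi-only op without a GPU kernel
  return true;                                       // no kernels at all: control-flow op
}

diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 174207deb08b84194d6f20fe04e4c27245295caf..5194c8772e47bca5ec728079b4b2dce883e39c22 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -21,6 +21,9 @@ limitations under the License.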
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -297,184 +300,6 @@ The required data format for this layer is one of the following: )DOC"); } -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - - bool global_stats = test_mode || use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensionss is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // input dimension is 2 and the format is NCHW. The input can be regarded - // as NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - if (!global_stats) { - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - saved_mean->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap saved_variance_e( - saved_variance->mutable_data(ctx.GetPlace()), C); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); - - if ((N * sample_size) == 1) { - // Only 1 element in normalization dimension, - // we skip the batch norm calculation, let y = x. 
- framework::TensorCopy(*x, ctx.GetPlace(), y); - return; - } - - switch (data_layout) { - case DataLayout::kNCHW: { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - saved_mean_e(nc % C) += x_arr.col(nc).sum(); - } - saved_mean_e /= N * sample_size; - for (int nc = 0; nc < N * C; ++nc) { - saved_variance_e(nc % C) += - (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); - } - saved_variance_e /= N * sample_size; - break; - } - case DataLayout::kNHWC: { - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - for (int i = 0; i < N * sample_size; ++i) { - saved_mean_e += x_arr.col(i); - } - saved_mean_e /= N * sample_size; - for (int i = 0; i < N * sample_size; ++i) { - saved_variance_e += - (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); - } - saved_variance_e /= N * sample_size; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - momentum = mom_tensor->data()[0]; - } - - running_mean_arr = - running_mean_arr * momentum + saved_mean_e * (1. - momentum); - running_var_arr = - running_var_arr * momentum + saved_variance_e * (1. - momentum); - } - - // use SavedMean and SavedVariance to do normalize - Eigen::Array inv_std(C); - if (global_stats) { - ConstEigenVectorArrayMap var_arr( - ctx.Input("Variance")->data(), C); - inv_std = (var_arr + epsilon).sqrt().inverse(); - } else { - EigenVectorArrayMap saved_inv_std( - ctx.Output("SavedVariance")->data(), C); - // inverse SavedVariance first, gradient will use it too. - saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); - inv_std = saved_inv_std; - } - ConstEigenVectorArrayMap mean_arr( - global_stats ? 
ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), - C); - - // ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - Eigen::Array new_scale = inv_std * scale_arr; - Eigen::Array new_bias = - bias_arr - mean_arr * inv_std * scale_arr; - - switch (data_layout) { - case DataLayout::kNCHW: { - EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, - N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); - } - break; - } - case DataLayout::kNHWC: { - EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, - N * sample_size) = - (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * - new_scale) - .colwise() + - new_bias; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %d", data_layout)); - } - } -}; - void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { // check input OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad"); @@ -585,261 +410,6 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - // if the input of batch norm is stop_gradient, d_x is null. - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - // Get the size for each dimension. - // NCHW [batch_size, in_channels, in_height, in_width] - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." 
- "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - // input dimension is 2 and the format is NCHW. The input can be regarded as - // NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - const T *mean_data = saved_mean->data(); - const T *inv_var_data = saved_inv_variance->data(); - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - T *d_bias_data = nullptr; - T *d_scale_data = nullptr; - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - d_bias_data = d_bias->mutable_data(ctx.GetPlace()); - d_scale_data = d_scale->mutable_data(ctx.GetPlace()); - } - - // d_bias = np.sum(d_y, axis=0) - // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) - // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) - // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) - EigenVectorArrayMap d_bias_arr(d_bias_data, C); - EigenVectorArrayMap d_scale_arr(d_scale_data, C); - - if (d_scale && d_bias) { - d_bias_arr.setZero(); - d_scale_arr.setZero(); - } - - if (d_x && (N * sample_size) == 1 && !use_global_stats) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - return; - } - - int scale_coefff = use_global_stats ? 
1 : N * sample_size; - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; - - Tensor dy_sum; - dy_sum.Resize({C}); - dy_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_sum_arr(dy_sum.mutable_data(ctx.GetPlace()), - C); - - Tensor dy_mul_x_sub_mean_mul_invstd_sum; - dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()), C); - - dy_sum_arr.setZero(); - dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); - - // inplace calculation - // Y: ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - // X: (y - bias) / scale / (inv_var) + est_mean - // formula transform ====> - // (y - bias) / (scale * inv_var) + est_mean - switch (data_layout) { - case DataLayout::kNCHW: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), - sample_size, N * C); - ConstEigenArrayMap y_data(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / - scale_inv_var_nhw(nc % C) / scale_coefff + - mean_arr(nc % C); - } - } - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); - - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - dy_sum_arr(c) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), - sample_size, N * C); - if (!use_global_stats) { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * - inv_var_arr(c)); - } - } else { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); - } - } - } - break; - } - case DataLayout::kNHWC: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), C, - N * sample_size); - ConstEigenArrayMap y_data(x->data(), C, N * sample_size); - for (int nhw = 0; nhw < N * sample_size; nhw++) { - x_data.col(nhw) = (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / - scale_coefff + - mean_arr; - } - } - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); - - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - dy_sum_arr += d_y_arr.col(nhw); - dy_mul_x_sub_mean_mul_invstd_sum_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, - N * sample_size); - if (!use_global_stats) { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - - (x_arr.col(nhw) - mean_arr) * - dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); - } - } else { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = 
scale_inv_var_nhw * d_y_arr.col(nhw); - } - } - } - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - } -}; - template void BatchNormGradMaker::Apply(GradOpPtr op) const { op->SetType(this->ForwardOpType() + "_grad"); @@ -951,335 +521,16 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const float epsilon = ctx.Attr("epsilon"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - dX->mutable_data(ctx.GetPlace()); - ddY->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - - const auto &x_dims = X->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = X->numel() / C; - phi::funcs::SetConstant set_constant; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - // transpose NCHW -> NHWC for easy calculate - Tensor transformed_x(X->type()); - Tensor transformed_dy(dY->type()); - Tensor transformed_ddx(ddX->type()); - - Tensor transformed_dx(dX->type()); - Tensor transformed_ddy(ddY->type()); - if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - // Input Tensor - ResizeToChannelLast(ctx, X, - &transformed_x); - TransToChannelLast(ctx, X, &transformed_x); - ResizeToChannelLast(ctx, dY, - &transformed_dy); - TransToChannelLast(ctx, dY, - &transformed_dy); - ResizeToChannelLast(ctx, ddX, - &transformed_ddx); - TransToChannelLast(ctx, ddX, - &transformed_ddx); - // Output Tensor - ResizeToChannelLast(ctx, dX, - &transformed_dx); - ResizeToChannelLast(ctx, ddY, - &transformed_ddy); - } else { - transformed_x.ShareDataWith(*X); - transformed_dy.ShareDataWith(*dY); - transformed_ddx.ShareDataWith(*ddX); - - transformed_dx.ShareDataWith(*dX); - transformed_ddy.ShareDataWith(*ddY); - } - - ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - Tensor mean_tile; - mean_tile.Resize({C, sample_size}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - - Tensor inv_var_tile; - inv_var_tile.Resize({C, sample_size}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), C, sample_size); - - mean_tile_data = mean_arr.replicate(1, sample_size); - inv_var_tile_data = inv_var_arr.replicate(1, sample_size); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? 
Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({C, sample_size}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - scale_tile_data = scale_arr.replicate(1, sample_size); - - ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); - ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({C, sample_size}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), C, sample_size); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - EigenArrayMap dx_arr(transformed_dx.mutable_data(ctx.GetPlace()), C, - sample_size); - dx_arr.setZero(); - if (use_global_stats) { - // math: dx = (ddscale * dy) * inv_var - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; - } - } else { - // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, - // axis=(n,h,w)) * - // np.sum(dy, axis=(n,h,w)) - - // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - - // mean), - // axis=(n,h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / - // NxHxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * - // np.sum(dy, - // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var - // * - // np.mean(dy, axis=(n,h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w))) - - if (ddX) { - dx_arr += - (x_sub_mean_mul_invstd_arr * inv_var_tile_data * - inv_var_tile_data / sample_size) - .colwise() * - (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - - (dy_arr * ddx_arr).rowwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (dy_arr.rowwise().sum() / sample_size - dy_arr); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (ddx_arr.rowwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr += (dy_arr * inv_var_tile_data - - (dy_arr.rowwise().sum().replicate(1, sample_size) / - sample_size) * - inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size) * - ddscale_tile_data; - } - } - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_dx, dX); - } - } - if (dScale) { - dScale->mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - dscale_arr.setZero(); - if (use_global_stats) { - // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var - if (ddX) { - dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); - } - } else { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * - // ddx - if (ddX) { - Tensor first_grad; - first_grad.Resize({C, sample_size}); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), C, sample_size); - first_grad_arr.setZero(); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); - } - } - } - - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - EigenArrayMap ddy_arr(transformed_ddy.mutable_data(ctx.GetPlace()), - C, sample_size); - ddy_arr.setZero(); - if (use_global_stats) { - // math: ddy = r * ddx * inv_var + ddbias + - // ddscale * (x - mean) * inv_var - if (ddX) { - ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; - } - } else { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(n,h,w))) - if (ddX) { - ddy_arr += - scale_tile_data * inv_var_tile_data * - (ddx_arr - - ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - } - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - } - - if (ddBias) 
{ - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - - ddy_arr += ddbias_tile_data; - } - - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_ddy, ddY); - } - } - } -}; - DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(batch_norm, BatchNormInferShapeFunctor, + PD_INFER_META(phi::BatchNormInferMeta)); + REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index f8d37d685b929258118e5d4b9d02a6be9d71c078..d274e8d2c006d7cbfe8337eab5c6d9a57a62e5ca 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -113,23 +113,5 @@ class BatchNormOpInferVarType } }; -template -class BatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index a70b6e991161dfef99cc0b6da9fba9a2696cc08e..ae03ecbcb16a0441cdb87e0ec579c07d872bc9a2 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *variance_out = ctx.Output("VarianceOut"); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); // if MomentumTensor is set, use MomentumTensor value, momentum // is only used in this training branch @@ -170,8 +170,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); if (use_global_stats) { const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8213e877f722433488cd826bb63cba376972c57a..9be63a85fc0de3ba75cb9741b25f7c312cd9f60b 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -27,6 +27,9 @@ limitations under the License. 
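
Aside: the DECLARE_INFER_SHAPE_FUNCTOR / PD_INFER_META pair used for batch_norm here (and for conv, yolo_box, dropout, and gelu below) routes an operator's shape inference to a shared phi InferMeta function instead of a hand-written InferShape. A toy model of the pattern, with deliberately simplified stand-in types (MetaTensor and InferShapeFunctor below are illustrative only; batch_norm's real meta function handles several outputs, not one):

#include <cstdint>
#include <functional>
#include <vector>

struct MetaTensor {
  std::vector<int64_t> dims;
};

// Shared shape rule, phi-style: output mirrors input (what
// phi::UnchangedInferMeta does for gelu later in this diff).
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { out->dims = x.dims; }

// Conceptually what the DECLARE_INFER_SHAPE_FUNCTOR macro produces:
// a thin adaptor binding the framework's InferShape entry point to
// the shared meta function.
struct InferShapeFunctor {
  std::function<void(const MetaTensor&, MetaTensor*)> meta_fn;
  void operator()(const MetaTensor& x, MetaTensor* y) const { meta_fn(x, y); }
};

int main() {
  InferShapeFunctor infer{UnchangedInferMeta};
  MetaTensor x{{8, 64, 32, 32}}, y;
  infer(x, &y);  // y.dims == x.dims
}
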
*/ #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -841,6 +844,8 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType( } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(conv2d, Conv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -851,6 +856,8 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad, REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad); // depthwise convolution op +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d, DepthwiseConv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -860,6 +867,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad, ops::Conv2DDoubleGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad_grad, ops::ConvOpDoubleGrad); +DECLARE_INFER_SHAPE_FUNCTOR(conv3d, Conv3dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, ops::ConvOpInferVarType, ops::Conv3DGradMaker, diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 8897f7b229c321e28609d8ef739f4388f5cb586a..fcda16a3e72ac9250a0206e69f50c75d71cb0d64 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -356,7 +356,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); const auto& runner = NpuOpRunner( diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index b15efc5f84bdd0a62f3ee5deca01b1e601f19aed..6e15fd090b8c4feeb8837efb392a2d3a6a6b80c7 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -338,8 +338,6 @@ REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); -REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel, - ops::DeformableConvCPUKernel); REGISTER_OP_CPU_KERNEL(deformable_conv_grad, ops::DeformableConvGradCPUKernel, ops::DeformableConvGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 2c7d905c79b37e9b1c8777d62f1b593c8a8866a5..ad10abf9c647b588e8c66dea89588e344c46ae69 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -446,108 +446,6 @@ __global__ void FilterGradAddupGpuKernel(const int nthreads, const int n, } } -template -class DeformableConvCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor offset = *ctx.Input("Offset"); - const Tensor mask = *ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.cuda_device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = 
ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask.numel() / mask.dims()[0]; - - auto blas = phi::funcs::GetBlas(dev_ctx); - - const T* input_ptr = input->data(); - const T* offset_ptr = offset.data(); - const T* mask_ptr = mask.data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2col( - ctx.device_context(), input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - 
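
Aside: the forward kernel deleted above lowers deformable conv to ModulatedDeformableIm2col followed by a per-group GEMM with M = out_channels / groups, N = im2col_step * o_h * o_w, and K = in_channels * k_h * k_w / groups. The group loop, sketched with a plain triple loop standing in for blas.MatMul:

#include <cstddef>

void GroupedGemm(const float* weight,      // [groups, M, K] slices of weight_3d
                 const float* col_buffer,  // [groups, K, N] slices of col_buffer_3d
                 float* output,            // [groups, M, N] slices of output_3d
                 int groups, int M, int N, int K) {
  for (int g = 0; g < groups; ++g) {
    const float* w = weight + static_cast<size_t>(g) * M * K;
    const float* col = col_buffer + static_cast<size_t>(g) * K * N;
    float* out = output + static_cast<size_t>(g) * M * N;
    for (int m = 0; m < M; ++m) {
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) acc += w[m * K + k] * col[k * N + n];
        out[m * N + n] = acc;  // alpha = T(1.0), beta = T(0.0) in blas.MatMul
      }
    }
  }
}
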
template class DeformableConvGradCUDAKernel : public framework::OpKernel { public: @@ -740,9 +638,6 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(deformable_conv, - ops::DeformableConvCUDAKernel, - ops::DeformableConvCUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, ops::DeformableConvGradCUDAKernel, ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 66961655ee6ffa88e162477ad424eb10a0702b27..1176b96987ed6fbd0077e68d5bb0d4ece5c4b4f0 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -318,102 +318,6 @@ void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height, } } -template -class DeformableConvCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset->numel() / offset->dims()[0]; - int input_mask_dim = mask->numel() / 
mask->dims()[0]; - auto blas = phi::funcs::GetBlas(dev_ctx); - const T* input_ptr = input->data(); - const T* offset_ptr = offset->data(); - const T* mask_ptr = mask->data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2colCPU( - dev_ctx, input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - // get the product of pixel and weight - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 0d9fbf612f73c428fb8050fcfcc319ddafabe482..35e389090175f7768244b95b1d388ea0d735c2d5 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,8 +9,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -235,10 +237,13 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, YoloBoxInferShapeFunctor, + PD_INFER_META(phi::YoloBoxInferMeta)); REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + YoloBoxInferShapeFunctor); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46..3d9950902acfe80a3cfef6c9efa2c6370e685c32 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -14,7 +14,9 @@ limitations under the License. 
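
Aside: the dropout hunk just below deletes the op's hand-written InferShape in favor of phi::DropoutInferMeta. The rule it encodes, restated standalone (names hypothetical): Out mirrors X, and Mask is only produced in training mode, i.e. when is_test is false.

#include <cstdint>
#include <optional>
#include <vector>

struct DropoutShapes {
  std::vector<int64_t> out;
  std::optional<std::vector<int64_t>> mask;  // absent at inference time
};

DropoutShapes InferDropoutShapes(const std::vector<int64_t>& x_dims, bool is_test) {
  DropoutShapes s;
  s.out = x_dims;
  if (!is_test) s.mask = x_dims;
  return s;
}
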
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,17 +27,6 @@ class DropoutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dropout"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - if (ctx->Attrs().Get("is_test") == false) { - ctx->SetOutputDim("Mask", x_dims); - } - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,7 +164,11 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor, + PD_INFER_META(phi::DropoutInferMeta)); + REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, - ops::DropoutGradOpMaker); + ops::DropoutGradOpMaker, + DropoutInferShapeFunctor); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 761b401ca9a2e535e1badfee834ef9ee98a07aae..d1a1aa3008c8b33690ecd9ea85501ad0178f592a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -198,10 +198,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); - } - - // elementwise_mul & elementwise_div - else { + } else { // elementwise_mul & elementwise_div platform::BinaryMKLDNNHandler binary_handler( BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f); @@ -253,10 +250,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { } else { broadcast_src_memory = reorder_src_memory_p; } - } - - // elementwise_mul & elementwise_div - else { + } else { // elementwise_mul & elementwise_div std::unordered_map args; std::shared_ptr binary_prim; std::shared_ptr post_op_memory; diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index c445a28c084f67f2688e17994cb622903b73c707..e60fc44e9a6ffc106a9c6957c2365e7b44c467b9 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -120,6 +120,142 @@ class Conv2DFusionOp : public operators::ConvOp { ctx->SetOutputsDim("Outputs", output_shapes); } } + + std::vector ComputeOutputShape( + framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv"); + OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv"); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::string padding_algorithm = + ctx->Attrs().Get("padding_algorithm"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], 
0, + platform::errors::InvalidArgument( + "The dilation of Op(Conv) should be larger than 0, but received " + "dilation is %d.", + dilations[i])); + } + const std::string data_format = + ctx->Attrs().Get<std::string>("data_format"); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, true, + platform::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), in_dims)); + + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + platform::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. But received: the input's shape is " + "[%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, in_dims.size(), filter_dims, filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larger than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( + in_dims.size(), strides.size() + 2U, + platform::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be equal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimension and Attr(strides)'s length = %u.", + in_dims.size(), in_dims, strides.size(), phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? in_dims[in_dims.size() - 1] : in_dims[1]; + + PADDLE_ENFORCE_EQ( + input_channels, filter_dims[1] * groups, + platform::errors::InvalidArgument( + "The number of input's channels should be equal to filter's " + "channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. " + "The error may come from wrong data_format setting.", + input_channels, in_dims, filter_dims[1], filter_dims, groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + platform::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. 
But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], filter_dims, groups)); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GT( + filter_dims[0], 0, + platform::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); + } + + framework::DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!ctx->IsRuntime()) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + output_shape.push_back( + ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i], + paddings[2 * i], paddings[2 * i + 1], strides[i])); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + return output_shape; + } }; // TODO(qingqing): add gradient operator for conv2d_fusion diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3d338f00d4fcbf4be35b2392a10c275526dc5d4b..3be2606bfc93984f918adf595b522fe6bfca72be 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,10 +14,11 @@ limitations under the License. */ #include #include -#include - -#include "paddle/fluid/operators/gelu_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of GeluOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of GeluOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker, ops::GeluGradOpMaker, - ops::GeluGradOpMaker); + ops::GeluGradOpMaker, + GeluInferShapeFunctor); REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp); -REGISTER_OP_CPU_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CPU_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu deleted file mode 100644 index ef836ab72f001a540e081d7e9975ca5ee28758be..0000000000000000000000000000000000000000 
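
Aside: ComputeOutputShape above defers per-dimension sizing to ConvOutputSize. Assuming the usual dilated-convolution arithmetic that its call signature implies, the rule is: a dilated kernel covers dilation * (ksize - 1) + 1 input elements, and the padded remainder is stepped by stride.

#include <cassert>

int ConvOutputSize(int input, int ksize, int dilation, int pad0, int pad1,
                   int stride) {
  const int dkernel = dilation * (ksize - 1) + 1;
  const int output = (input + pad0 + pad1 - dkernel) / stride + 1;
  assert(output > 0);  // the real code raises an InvalidArgument error instead
  return output;
}

// e.g. a 224-wide input, 3x3 kernel, stride 2, pads 1/1, dilation 1:
// (224 + 1 + 1 - 3) / 2 + 1 = 112
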
--- a/paddle/fluid/operators/gelu_op.cu +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gelu_op.h" - -DECLARE_bool(use_fast_math); - -namespace paddle { -namespace operators { - -#ifdef __NVCC__ -template -static __device__ __forceinline__ float FP32FastTanh(float x) { -#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 - if (FastMode) { - float y; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); - return y; - } -#endif - return tanhf(x); -} - -template -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); -} - -template -static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * - (0.79788456f + 0.1070322243f * x * x)) + - 0.5f * (1.0f + tanh_out); - return tmp * y_g; -} - -template -static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT in_arr = *reinterpret_cast(x + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); - } - *reinterpret_cast(y + offset) = in_arr; - } -} - -template -static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, - const __half* y_g, __half* x_g, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT x_in_arr = *reinterpret_cast(x + offset); - ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - __half2 tmp_fp16_2; - tmp_fp16_2.x = x_in_arr[i]; - tmp_fp16_2.y = y_g_in_arr[i]; - float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); - x_in_arr[i] = - __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); - } - *reinterpret_cast(x_g + offset) = x_in_arr; - } -} - -static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y, - size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && 
is_aligned(x, kAlignment) && \ - is_aligned(y, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluFwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL - return false; -} - -static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, - const __half* y_g, __half* x_g, size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ - is_aligned(x_g, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluBwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y_g, \ - x_g, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL - return false; -} -#endif - -template -struct GeluWithApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // this function is tanh approximation of gelu - MPType x = static_cast(arg_x); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - auto tanh_out = - tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); - MPType out = x * half * (one + tanh_out); - return static_cast(out); - } -}; - -template -struct GeluWithoutApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // actual gelu with approximation = false - MPType x = static_cast(arg_x); - return static_cast(x * normcdf(x)); - } -}; - -template -class GeluKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - std::vector ins = {in}; - std::vector outs = {out}; - const auto& dev_ctx = - context.template device_context(); - - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = in->numel(); - const auto* in_ptr = reinterpret_cast(in->data()); - auto* out_ptr = reinterpret_cast<__half*>(out->data()); - if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr, - out_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, 
GeluWithApproximateFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); - } - } -}; - -template -struct GeluWithApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - MPType kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - auto cube_x = x * x * x; - auto tanh_out = - tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); - auto ans = - half * (one + tanh_out + - (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); - return static_cast(ans * dout); - } -}; - -template -struct GeluWithoutApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); - const MPType cdf = normcdf(x); - const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; - return static_cast(dout * (cdf + x * pdf)); - } -}; - -template -class GeluGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - std::vector ins = {x, dout}; - std::vector outs = {dx}; - const auto& dev_ctx = - context.template device_context(); - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = x->numel(); - const auto* x_ptr = reinterpret_cast(x->data()); - const auto* y_g_ptr = reinterpret_cast(dout->data()); - auto* x_g_ptr = reinterpret_cast<__half*>(dx->data()); - if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr, - x_g_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CUDA_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h deleted file mode 100644 index d4fed8a868ff9e66f64c90ab9352e824ab673217..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
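
Aside, for reference while reading the deleted device code: the tanh-approximate GELU and the gradient that GeluWithApproximateGradFunctor computes, transcribed as scalar host code with the same constants (kAlpha = sqrt(2/pi), kBeta = 3 * GELU_CONSTANT * kAlpha).

#define _USE_MATH_DEFINES
#include <cmath>

constexpr double kGeluC = 0.044715;  // GELU_CONSTANT

double GeluApprox(double x) {
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2;  // sqrt(2/pi) ~= 0.79788456
  return 0.5 * x * (1.0 + std::tanh(kAlpha * (x + kGeluC * x * x * x)));
}

double GeluApproxGrad(double x, double dout) {
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2;
  const double kBeta = kAlpha * kGeluC * 3.0;
  const double t = std::tanh(kAlpha * (x + kGeluC * x * x * x));
  const double ans =
      0.5 * (1.0 + t + (1.0 - t * t) * (kAlpha * x + kBeta * x * x * x));
  return ans * dout;
}
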
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
deleted file mode 100644
index d4fed8a868ff9e66f64c90ab9352e824ab673217..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gelu_op.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#ifndef _USE_MATH_DEFINES
-#define _USE_MATH_DEFINES
-#endif
-#include <algorithm>
-#include <cmath>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define GELU_CONSTANT 0.044715
-
-template <typename T>
-struct GeluFunctor {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out, bool approximate) const {
-    if (approximate) {
-      // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3})))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp =
-            (static_cast<float>(M_2_SQRTPI * M_SQRT1_2) *
-             (casted_x + static_cast<float>(GELU_CONSTANT) * casted_x.cube()))
-                .tanh();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (static_cast<T>(M_2_SQRTPI * M_SQRT1_2) *
-                     (x + static_cast<T>(GELU_CONSTANT) * x.cube()))
-                        .tanh();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto out_data = out.data();
-      int n = std::min(x.size(), out.size());
-
-      std::memset(out_data, 0, n * sizeof(T));
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
-                                 out_data, 1);
-      phi::funcs::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
-      for (int i = 0; i < n; i++) {
-        out_data[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::VMUL(n, x_data, out_data, out_data);
-      for (int i = 0; i < n; i++) {
-        out_data[i] *= static_cast<T>(0.5);
-      }
-#else
-      // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp = (casted_x * static_cast<float>(M_SQRT1_2)).erf();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-#endif
-    }
-  }
-};
-
-template <typename T>
-struct GeluGradFunctor {
-  template <typename Device, typename X, typename dOut, typename dX>
-  void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const {
-    if (approximate) {
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-
-        const float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
-        const float kBeta =
-            kAlpha * static_cast<float>(GELU_CONSTANT) * static_cast<float>(3);
-        const auto y =
-            (kAlpha *
-             ((static_cast<float>(GELU_CONSTANT) * casted_x.cube()) + casted_x))
-                .tanh();
-        dx.device(d) = (static_cast<float>(0.5) * casted_dout *
-                        (static_cast<float>(1) + y +
-                         (casted_x - casted_x * y.square()) *
-                             (kAlpha + kBeta * casted_x.square())))
-                           .template cast<T>();
-      } else {
-        const T kAlpha = static_cast<T>(M_2_SQRTPI * M_SQRT1_2);
-        const T kBeta =
-            kAlpha * static_cast<T>(GELU_CONSTANT) * static_cast<T>(3);
-        const auto y =
-            (kAlpha * ((static_cast<T>(GELU_CONSTANT) * x.cube()) + x)).tanh();
-        dx.device(d) = static_cast<T>(0.5) * dout *
-                       (static_cast<T>(1) + y +
-                        (x - x * y.square()) * (kAlpha + kBeta * x.square()));
-      }
-    } else {
-#if
defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                      \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto dx_data = dx.data();
-      auto dout_data = dout.data();
-      int n = std::min(x.size(), dx.size());
-
-      auto first = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(first, 0, n * sizeof(T));
-      auto second = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(second, 0, n * sizeof(T));
-
-      // first = (0.5 * (1 + erf(x / sqrt(2))))
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, first,
-                                 1);
-      phi::funcs::CBlas<T>::VMERF(n, first, first, VML_LA);
-      for (int i = 0; i < n; i++) {
-        first[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1);
-
-      // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2))
-      phi::funcs::CBlas<T>::VSQUARE(n, x_data, second);
-      phi::funcs::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1);
-      phi::funcs::CBlas<T>::VEXP(n, second, second);
-      phi::funcs::CBlas<T>::VMUL(n, x_data, second, second);
-      phi::funcs::CBlas<T>::SCAL(
-          n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1);
-
-      // dx = dout * (first + second);
-      phi::funcs::CBlas<T>::VADD(n, first, second, first);
-      phi::funcs::CBlas<T>::VMUL(n, dout_data, first, dx_data);
-
-      std::free(first);
-      std::free(second);
-#else
-      // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) *
-      // exp(- x^2 / 2)
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-        auto first = static_cast<float>(0.5) *
-                     (static_cast<float>(1) +
-                      ((casted_x * static_cast<float>(M_SQRT1_2)).erf()));
-        auto second = static_cast<float>(0.5 * M_2_SQRTPI * M_SQRT1_2) *
-                      casted_x *
-                      (-static_cast<float>(0.5) * casted_x.square()).exp();
-        dx.device(d) = (casted_dout * (first + second)).template cast<T>();
-      } else {
-        auto first =
-            static_cast<T>(0.5) *
-            (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
-
-        auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
-                      (-static_cast<T>(0.5) * x.square()).exp();
-        dx.device(d) = dout * (first + second);
-      }
-#endif
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    auto approximate = context.Attr<bool>("approximate");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluFunctor<T> functor;
-    functor(place, eigen_in, eigen_out, approximate);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* dout =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto approximate = context.Attr<bool>("approximate");
-    dx->mutable_data<T>(dout->place());
-
-    auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-    auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
-    auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluGradFunctor<T> functor;
-    functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
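The erf branch of the deleted GeluGradFunctor uses the closed form gelu'(x) = Phi(x) + x * phi(x). A quick standalone check against a central finite difference, illustrative only (plain C++, not Paddle code):

    #define _USE_MATH_DEFINES
    #include <cmath>
    #include <cstdio>

    double gelu(double x) { return 0.5 * x * (1.0 + std::erf(x * M_SQRT1_2)); }

    double gelu_grad(double x) {
      const double cdf = 0.5 * (1.0 + std::erf(x * M_SQRT1_2));  // Phi(x)
      // phi(x) = exp(-x^2/2) / sqrt(2*pi); 0.5 * M_2_SQRTPI * M_SQRT1_2 is
      // exactly 1/sqrt(2*pi), the same constant the kernel uses.
      const double pdf = std::exp(-0.5 * x * x) * (0.5 * M_2_SQRTPI * M_SQRT1_2);
      return cdf + x * pdf;  // d/dx [x * Phi(x)] = Phi(x) + x * phi(x)
    }

    int main() {
      const double h = 1e-6;
      for (double x : {-1.5, 0.0, 0.7, 3.0}) {
        double fd = (gelu(x + h) - gelu(x - h)) / (2.0 * h);
        std::printf("x=%5.2f analytic=%.8f fd=%.8f\n", x, gelu_grad(x), fd);
      }
    }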
diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc
index 18bbc7f4929c6493db9161d0415c0728eb8689c0..c5297dd9cd404b7637c2eec79dafcc027509ddcb 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/gelu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc
index f3ac53138328dbfad12c6d530a6517f40c658677..b132b3170756d95adfde51e6d6ce7a5f0f25ca26 100644
--- a/paddle/fluid/operators/gelu_op_npu_test.cc
+++ b/paddle/fluid/operators/gelu_op_npu_test.cc
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_DEVICE_KERNEL(gelu, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc
index b8c2e9becf2950d12f87ec5d61c05f3bf0010b12..559d2448ad94525d623e24fc8fb6c5e3881b58e3 100644
--- a/paddle/fluid/operators/gelu_op_xpu.cc
+++ b/paddle/fluid/operators/gelu_op_xpu.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
-
-#include "paddle/fluid/operators/gelu_op.h"
-
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index 9575ab54b32bda9292e5d266010484a34eae3e54..93f0d3d334f271ec7e40e38e9d654ad7f8ba3c59 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/multiary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -60,31 +64,6 @@ namespace operators {
 class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasOutput("PreOut"), "Output", "PreOut", "hsigmoid");
-
-    auto with_prefetch = ctx->Attrs().Get<bool>("remote_prefetch");
-    if (with_prefetch) {
-      OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid");
-    }
-    const int64_t input_dims = ctx->GetInputDim("X")[0];
-    const int64_t label_dims = ctx->GetInputDim("Label")[0];
-    PADDLE_ENFORCE_EQ(input_dims, label_dims,
-                      platform::errors::InvalidArgument(
-                          "The first dimension of "
-                          "input and label is expected to be the same. "
" - "But received input's first dimension is %d; " - "label's first dimension is %d.", - input_dims, label_dims)); - - std::vector output_shape({input_dims, 1}); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", /*->*/ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -272,22 +251,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR( - hierarchical_sigmoid, ops::HierarchicalSigmoidOp, - ops::HierarchicalSigmoidOpMaker, - ops::HierarchicalSigmoidGradMaker, - ops::HierarchicalSigmoidGradMaker); +DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid, + HierarchicalSigmoidInferShapeFunctor, + PD_INFER_META(phi::HierarchicalSigmoidInferMeta)); +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + ops::HierarchicalSigmoidGradMaker, + ops::HierarchicalSigmoidGradMaker, + HierarchicalSigmoidInferShapeFunctor); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, ops::HierarchicalSigmoidGradOpGradVarTypeInference, ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid, - ops::HierarchicalSigmoidOpKernel, - ops::HierarchicalSigmoidOpKernel); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid_grad, - ops::HierarchicalSigmoidGradOpKernel, - ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h deleted file mode 100644 index f11b28cfefb071182eb99cce3d8c2b7f2343cdf6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
deleted file mode 100644
index f11b28cfefb071182eb99cce3d8c2b7f2343cdf6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/clip_op.h"
-#include "paddle/fluid/operators/math/matrix_bit_code.h"
-#include "paddle/fluid/platform/transform.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-using platform::Transform;
-using framework::LoDTensor;
-
-static std::vector<int64_t> PathToRows(const LoDTensor& path) {
-  std::set<int64_t> rows;
-  const int64_t* paths = path.data<int64_t>();
-  for (int64_t i = 0; i < path.numel(); ++i) {
-    int64_t row = paths[i];
-    if (row < 0) {
-      continue;
-    }
-    rows.emplace(row);
-  }
-  return std::vector<int64_t>(rows.begin(), rows.end());
-}
-template <typename DeviceContext, typename T>
-class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X",
-                               "HierarchicalSigmoid");
-    auto& w = GET_DATA_SAFELY(ctx.Input<LoDTensor>("W"), "Input", "W",
-                              "HierarchicalSigmoid");
-    auto* path = ctx.Input<LoDTensor>("PathTable");
-    auto* code = ctx.Input<LoDTensor>("PathCode");
-    auto& label = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Label"), "Input",
-                                  "Label", "HierarchicalSigmoid");
-    auto* bias = ctx.Input<LoDTensor>("Bias");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* pre_out = ctx.Output<LoDTensor>("PreOut");
-    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-    // for remote prefetch
-
-    bool is_custom = false;
-    if (path) {
-      is_custom = true;
-    }
-    int64_t code_length =
-        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
-    int64_t batch_size = in.dims()[0];
-    LoDTensor sum;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto* pre_out_data = pre_out->mutable_data<T>(
-        phi::make_ddim({batch_size, code_length}), ctx.GetPlace());
-    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
-    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
-    // 0s can avoid out of path's loss.
-    phi::funcs::SetConstant<DeviceContext, T> zero;
-    zero(dev_ctx, pre_out, static_cast<T>(0.0));
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    phi::funcs::RowwiseSum<DeviceContext, T> row_sum;
-
-    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
-    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          num_classes, label.template data<int64_t>()));
-    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          *path, *code, label.template data<int64_t>()));
-    }
-
-    std::vector<int64_t> sum_dims({batch_size, 1UL});
-    sum.mutable_data<T>(phi::make_ddim(sum_dims), ctx.GetPlace());
-    auto sum_mat = EigenMatrix<T>::From(sum);
-    out->mutable_data<T>(ctx.GetPlace());
-    auto out_mat = framework::EigenMatrix<T>::From(*out);
-    if (bias) {
-      bit_code->Add(*bias, pre_out);
-    }
-    bit_code->Mul(pre_out, w, in);
-    // clip to [-40, 40]
-    Transform<DeviceContext> trans;
-    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
-          pre_out_data + pre_out->numel(), pre_out_data,
-          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
-    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
-    // use softrelu to calculate cross entropy
-    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
-    row_sum(dev_ctx, *pre_out, &sum);
-    // TODO(guosheng): Subtract the out of path's loss, since not all
-    // class(leaf) nodes' path lengths equal code_length. But it won't break the
-    // gradient check since both have the out of path's loss and will cancel out
-    // each other.
-    out_mat.device(place) = sum_mat + out_mat;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X",
-                               "HierarchicalSigmoidGrad");
-    auto& w = GET_DATA_SAFELY(ctx.Input<LoDTensor>("W"), "Input", "W",
-                              "HierarchicalSigmoidGrad");
-    auto* path = ctx.Input<LoDTensor>("PathTable");
-    auto* code = ctx.Input<LoDTensor>("PathCode");
-    auto* in_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    bool is_sparse = ctx.Attr<bool>("is_sparse");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> zero;
-    auto& label = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Label"), "Input",
-                                  "Label", "HierarchicalSigmoidGrad");
-    auto& pre_out = GET_DATA_SAFELY(ctx.Input<LoDTensor>("PreOut"), "Input",
-                                    "PreOut", "HierarchicalSigmoidGrad");
-    auto& out_grad = GET_DATA_SAFELY(
-        ctx.Input<LoDTensor>(framework::GradVarName("Out")), "Input",
-        framework::GradVarName("Out"), "HierarchicalSigmoidGrad");
-    LoDTensor pre_out_grad;
-
-    pre_out_grad.mutable_data<T>(pre_out.dims(), ctx.GetPlace());
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    zero(dev_ctx, in_grad, static_cast<T>(0.0));
-
-    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-
-    bool is_custom = false;
-    if (path) {
-      is_custom = true;
-    }
-
-    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
-    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          num_classes, label.template data<int64_t>()));
-    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          *path, *code, label.template data<int64_t>()));
-    }
-
-    // softrelu derivative
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-
-    auto* pre_out_grad_data = pre_out_grad.data<T>();
-    auto* pre_out_data = pre_out.template data<T>();
-    auto n = pre_out.numel();
-    blas.VEXP(n, pre_out_data, pre_out_grad_data);
-    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
-    for (int64_t i = 0; i < n; ++i) {
-      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
-    }
-    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    auto* out_grad_data = out_grad.template data<T>();
-
-    int64_t dim0 = pre_out_grad.dims()[0];
-    int64_t dim1 = pre_out_grad.dims()[1];
-    for (int64_t i = 0; i < dim0; ++i) {
-      T tmp = out_grad_data[i];
-      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
-    }
-    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
-    // be consistent with the clipping in forward.
-    auto* bias_grad = ctx.Output<LoDTensor>(framework::GradVarName("Bias"));
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code->AddGrad(pre_out_grad, bias_grad);
-    }
-    if (!is_sparse) {
-      auto* w_grad = ctx.Output<LoDTensor>(framework::GradVarName("W"));
-      w_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, w_grad, static_cast<T>(0.0));
-      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
-    } else {
-      PADDLE_ENFORCE_NOT_NULL(path,
-                              platform::errors::NotFound(
-                                  "Custom tree must be set for sparse mode!"));
-      framework::Vector<int64_t> real_rows = PathToRows(*path);
-      auto* w_grad = ctx.Output<phi::SelectedRows>(framework::GradVarName("W"));
-      w_grad->set_rows(real_rows);
-      // Build a map of id -> row_index to speed up finding the index of one id
-      w_grad->set_height(w.dims()[0]);
-      auto* w_grad_value = w_grad->mutable_value();
-      framework::DDim temp_dim(w.dims());
-      temp_dim[0] = real_rows.size();
-      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
-      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
-      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
-    }
-    bit_code->MulGradError(pre_out_grad, w, in_grad);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc
index 92cc6077defcd3f2b27c1b45875014742bc792ae..c9fd75651b5892beffa3b2aad7c21a0805facfce 100644
--- a/paddle/fluid/operators/histogram_op.cc
+++ b/paddle/fluid/operators/histogram_op.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -28,27 +30,6 @@ class HistogramOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "histogram");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "histogram");
-    const auto &nbins = ctx->Attrs().Get<int64_t>("bins");
-    const auto &minval = ctx->Attrs().Get<int>("min");
-    const auto &maxval = ctx->Attrs().Get<int>("max");
-
-    PADDLE_ENFORCE_GE(nbins, 1,
-                      platform::errors::InvalidArgument(
-                          "The bins should be greater than or equal to 1."
-                          "But received nbins is %d",
-                          nbins));
-    PADDLE_ENFORCE_GE(maxval, minval, platform::errors::InvalidArgument(
-                                          "max must be larger or equal to min."
- "But received max is %d, min is %d", - maxval, minval)); - - ctx->SetOutputDim("Out", phi::make_ddim({nbins})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -81,7 +62,12 @@ class HistogramOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(histogram, HistogramInferShapeFunctor, + PD_INFER_META(phi::HistogramInferMeta)); + REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + HistogramInferShapeFunctor); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 7f5136969980b887bb7bbe013690898e66abeac1..77951ff394e7491569746c89ac45826f23fdf313 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -323,6 +323,7 @@ class InplaceABNGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker, ops::BatchNormOpInferVarType, ops::InplaceABNOpGradMaker, diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 2a79cee27814e86b277d927082e9a772359217f1..4c679d30263863c70176bebb686556af056068d0 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc
index 2a79cee27814e86b277d927082e9a772359217f1..4c679d30263863c70176bebb686556af056068d0 100644
--- a/paddle/fluid/operators/kthvalue_op.cc
+++ b/paddle/fluid/operators/kthvalue_op.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/kthvalue_op.h"
 #include <memory>
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue");
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
-    PADDLE_ENFORCE_LT(axis, dim_size,
-                      paddle::platform::errors::InvalidArgument(
-                          "the axis must be [-%d, %d), but received %d .",
-                          dim_size, dim_size, axis));
-    PADDLE_ENFORCE_GE(axis, -dim_size,
-                      paddle::platform::errors::InvalidArgument(
-                          "the axis must be [-%d, %d), but received %d .",
-                          dim_size, dim_size, axis));
-    if (axis < 0) axis += dim_size;
-    int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
-    PADDLE_ENFORCE_GE(
-        k, 1, paddle::platform::errors::InvalidArgument(
-                  "the k in the kthvalue must >= 1, but received %d .", k));
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of kthvalue must have >= 1d shape"));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(
-          input_dims[axis], k,
-          paddle::platform::errors::InvalidArgument(
-              "input of kthvalue must have >= %d columns in axis of %d", k,
-              axis));
-    }
-    bool keepdim = ctx->Attrs().Get<bool>("keepdim");
-    std::vector<int64_t> dimvec;
-    for (int64_t i = 0; i < axis; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    if (keepdim) {
-      dimvec.emplace_back(static_cast<int64_t>(1));
-    }
-    for (int64_t i = axis + 1; i < dim_size; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    framework::DDim dims = phi::make_ddim(dimvec);
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor,
+                            PD_INFER_META(phi::KthvalueInferMeta));
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker,
                   ops::KthvalueGradOpMaker<paddle::framework::OpDesc>,
-                  ops::KthvalueGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    kthvalue, ops::KthvalueCPUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUDeviceContext, int32_t>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
+                  ops::KthvalueGradOpMaker<paddle::imperative::OpBase>,
+                  KthvalueInferShapeFunctor);
 REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    kthvalue_grad,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUDeviceContext, int32_t>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu
deleted file mode 100644
index f6f56f70f1a11971b31e679ef879f2d1d0a96085..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kthvalue_op.cu
+++ /dev/null
@@ -1,278 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/kthvalue_op.h"
-#include "paddle/fluid/operators/top_k_function_cuda.h"
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-#endif
-
-namespace paddle {
-namespace operators {
-
-int getBlockSize(int col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256 && col <= 512)
-    return 512;
-  else if (col > 128 && col <= 256)
-    return 256;
-  else if (col > 64 && col <= 128)
-    return 128;
-  else
-    return 64;
-}
-
-template <typename T>
-bool SortKthvalue(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor* input_tensor, const int64_t num_cols,
-                  const int64_t num_rows, const int k,
-                  framework::Tensor* out_tensor,
-                  framework::Tensor* indices_tensor) {
-  auto cu_stream = ctx.stream();
-  framework::Tensor input_indices;
-  const std::vector<int64_t> dims = {num_rows, num_cols};
-  auto dim = phi::make_ddim(dims);
-  input_indices.Resize(dim);
-  input_indices.mutable_data<int64_t>(ctx.GetPlace());
-  size_t temp_storage_bytes = -1;
-  int block_size = getBlockSize(num_cols);
-  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
-  unsigned int grid_size = num_rows < maxGridDimX
-                               ?
static_cast<unsigned int>(num_rows)
-                               : maxGridDimX;
-  InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
-      input_indices.data<int64_t>(), num_rows, num_cols);
-  cub::CountingInputIterator<int64_t> counting_iter(0);
-  cub::TransformInputIterator<int64_t, SegmentOffsetIter,
-                              cub::CountingInputIterator<int64_t>>
-      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
-  T* sorted_values_ptr;
-  int64_t* sorted_indices_ptr;
-  framework::Tensor temp_values, temp_indices;
-  const T* input = input_tensor->data<T>();
-  T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
-  temp_values.Resize(dim);
-  temp_indices.Resize(dim);
-  sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
-  sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
-  auto err = cub::DeviceSegmentedRadixSort::SortPairs(
-      nullptr, temp_storage_bytes, input, sorted_values_ptr,
-      input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
-      num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-      cu_stream);
-#ifdef __HIPCC__
-  if (err != hipSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
-               << hipGetErrorString(err);
-    return false;
-  }
-#else
-  if (err != cudaSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "cub::DeviceSegmentedRadixSort::SortPairs, status: "
-               << cudaGetErrorString(err);
-    return false;
-  }
-#endif
-  framework::Tensor temp_storage;
-  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
-
-  err = cub::DeviceSegmentedRadixSort::SortPairs(
-      temp_storage.data<uint8_t>(), temp_storage_bytes, input,
-      sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
-      num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
-      0, sizeof(T) * 8, cu_stream);
-#ifdef __HIPCC__
-  if (err != hipSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "hipcub::DeviceSegmentedRadixSort::SortPairs, "
-               << temp_storage_bytes << ", status: " << hipGetErrorString(err);
-    return false;
-  }
-#else
-  if (err != cudaSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "cub::DeviceSegmentedRadixSort::SortPairs, "
-               << temp_storage_bytes << ", status: " << cudaGetErrorString(err);
-    return false;
-  }
-#endif
-  auto& dev = *ctx.eigen_device();
-  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
-  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
-  auto e_indices = framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
-  auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
-      static_cast<const framework::Tensor>(temp_indices));
-  std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
-  dim = phi::make_ddim(odims);
-  auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
-  auto e_tmp_values = framework::EigenMatrix<T>::From(
-      static_cast<const framework::Tensor>(temp_values));
-
-  EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
-      dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
-  EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
-      dev, e_values, e_tmp_values, slice_indices, slice_sizes);
-  return true;
-}
-
-template <typename DeviceContext, typename T>
-class KthvalueOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    auto* indices = ctx.Output<framework::Tensor>("Indices");
-    int k = static_cast<int>(ctx.Attr<int>("k"));
-    int axis = static_cast<int>(ctx.Attr<int>("axis"));
-    bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
-    const auto& in_dims = input->dims();
-    if (axis < 0) axis +=
in_dims.size();
-    auto out_dims = output->dims();
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
-      PADDLE_ENFORCE_EQ(SortKthvalue<T>(dev_ctx, input, input_width,
-                                        input_height, k, output, indices),
-                        true, platform::errors::External(
-                                  "KthvalueOP: Error when use cub sorting"));
-      return;
-    } else {
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      if (!keepdim) {
-        std::vector<int64_t> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dims);
-        indices->Resize(tmp_out_dims);
-      }
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(in_dims);
-      for (int i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-        trans_out_dims[i] = in_dims[trans[i]];
-      }
-      trans_out_dims[in_dims.size() - 1] = 1;
-      framework::Tensor trans_input;
-      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
-                                                   &trans_input, trans);
-      framework::Tensor trans_ind, trans_out;
-      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
-      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-      PADDLE_ENFORCE_EQ(
-          SortKthvalue<T>(dev_ctx, &trans_input, input_width, input_height, k,
-                          &trans_out, &trans_ind),
-          true,
-          platform::errors::External("KthvalueOP: Error when use cub sorting"));
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, trans_ind, indices, trans);
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
-                                                   output, trans);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KthvalueOpGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(context.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = context.Attr<int>("axis");
-    int k = static_cast<int>(context.Attr<int>("k"));
-    const auto& in_dims = x->dims();
-    auto out_dims = indices->dims();
-    if (axis < 0) axis += in_dims.size();
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    const T* out_grad_data = out_grad->data<T>();
-    const int64_t* indices_data = indices->data<int64_t>();
-    int pre, n, post;
-    GetDims(in_dims, axis, &pre, &n, &post);
-    auto& dev_ctx = context.cuda_device_context();
-    int block_size = getBlockSize(post * k);
-    int
max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
-    int grid_size = std::min(max_blocks, pre);
-    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
-        out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    kthvalue,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    kthvalue_grad,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h
deleted file mode 100644
index 15df0a10c6992f07f9913b867319bff342180c3d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kthvalue_op.h
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-template <typename T, typename Type>
-static void getKthvalue(Type input_height, Type input_width, int input_dim,
-                        const framework::Tensor* input, T* t_out,
-                        Type* t_indices, const int& k) {
-  bool partial_sort_flag = (k * 64) < input_width;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    if (partial_sort_flag) {
-      std::partial_sort(
-          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
-          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-            return (!std::isnan(static_cast<double>(l.first)) &&
-                    std::isnan(static_cast<double>(r.first))) ||
-                   (l.first < r.first);
-          });
-    } else {
-      std::nth_element(
-          col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-            return (!std::isnan(static_cast<double>(l.first)) &&
-                    std::isnan(static_cast<double>(r.first))) ||
-                   (l.first < r.first);
-          });
-    }
-    t_out[i] = col_vec[k - 1].first;
-    t_indices[i] = col_vec[k - 1].second;
-  }
-}
-
-template <typename T, typename Type>
-static void kthvalueAssign(const Type& input_height, const Type& input_width,
-                           const int& input_dim, const framework::Tensor* input,
-                           const framework::Tensor* indices, T* output_data) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto
e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      output_data[i * input_width + e_indices(0)] = e_input(0);
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class KthvalueCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<framework::Tensor>("X");
-    auto* output = context.Output<framework::Tensor>("Out");
-    auto* indices = context.Output<framework::Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    int k = static_cast<int>(context.Attr<int>("k"));
-    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    auto out_dims = output->dims();
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                              output_data, indices_data, k);
-    } else {
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      if (!keepdim) {
-        std::vector<int64_t> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dims);
-        indices->Resize(tmp_out_dims);
-      }
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(in_dims);
-
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-        trans_out_dims[i] = in_dims[trans[i]];
-      }
-      trans_out_dims[in_dims.size() - 1] = 1;
-      framework::Tensor trans_inp;
-      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
-                                                  &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-      framework::Tensor tmp_out, tmp_indices;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
-      auto* t_ind =
-          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
-
-      getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(),
-                              &trans_inp, t_out, t_ind, k);
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  output, trans);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KthvalueGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    bool keepdim =
static_cast<bool>(context.Attr<bool>("keepdim"));
-    auto in_dims = x->dims();
-    auto out_dims = indices->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    if (!keepdim) {
-      std::vector<int> tmp_out_shape;
-      for (int i = 0; i < axis; i++) {
-        tmp_out_shape.emplace_back(out_dims[i]);
-      }
-      tmp_out_shape.emplace_back(1);
-      for (int i = axis + 1; i < in_dims.size(); i++) {
-        tmp_out_shape.emplace_back(out_dims[i - 1]);
-      }
-      out_dims = phi::make_ddim(tmp_out_shape);
-    }
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis == in_dims.size() - 1) {
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      if (keepdim) {
-        kthvalueAssign(input_height, input_width, in_dims.size(), out_grad,
-                       indices, x_grad_data);
-      } else {
-        auto& dev_context =
-            context.template device_context<platform::CPUDeviceContext>();
-        framework::Tensor out_grad_tmp, indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
-                       &indices_tmp, x_grad_data);
-      }
-    } else {
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      framework::DDim trans_dims(out_dims);
-      framework::DDim trans_in_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = out_dims[trans[i]];
-        trans_in_dims[i] = in_dims[trans[i]];
-      }
-      framework::Tensor trans_dO, trans_ind;
-      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
-      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-      if (keepdim) {
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, *out_grad, &trans_dO, trans);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, *indices, &trans_ind, trans);
-      } else {
-        framework::Tensor out_grad_tmp, indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, out_grad_tmp, &trans_dO, trans);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, indices_tmp, &trans_ind, trans);
-      }
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
-      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
-      framework::Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-      kthvalueAssign(input_height, input_width, in_dims.size(),
-                     &trans_dO, &trans_ind, t_out);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  x_grad, trans);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
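The core of the deleted CPU path is a NaN-aware selection: NaN compares as "largest" so it never wins for small k, and std::nth_element yields the k-th smallest value (1-based) plus its original index. A standalone sketch (plain C++, illustrative only; the helper is ours, not the Paddle API):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>
    #include <vector>

    std::pair<float, int> kthvalue(std::vector<float> x, int k) {
      std::vector<std::pair<float, int>> v;
      for (int i = 0; i < static_cast<int>(x.size()); ++i) v.push_back({x[i], i});
      std::nth_element(v.begin(), v.begin() + k - 1, v.end(),
                       [](const std::pair<float, int>& l,
                          const std::pair<float, int>& r) {
                         // NaN sorts after every number, mirroring the
                         // comparator in the deleted getKthvalue().
                         return (!std::isnan(l.first) && std::isnan(r.first)) ||
                                (l.first < r.first);
                       });
      return v[k - 1];
    }

    int main() {
      auto [val, idx] = kthvalue({3.f, 1.f, 4.f, 1.5f}, 2);
      std::printf("2nd smallest = %g at index %d\n", val, idx);  // 1.5, index 3
    }

The deleted header additionally switches to std::partial_sort when k * 64 < input_width, a heuristic that trades the lower complexity of nth_element for partial_sort's better cache behavior on small k.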
diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc
index 0e69b397e04c7eda7f515350caf870be5d7b57a5..da38f906b9bd34ba6c3251059ee12902e62eadaf 100644
--- a/paddle/fluid/operators/log_softmax_op.cc
+++ b/paddle/fluid/operators/log_softmax_op.cc
@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/log_softmax_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    return UnaryOpUnchangedInferShapeCheckAxis(ctx);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-
+DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMetaCheckAxis));
 REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker,
                   ops::LogSoftmaxOpInferVarType,
                   ops::LogSoftmaxGradOpMaker<paddle::framework::OpDesc>,
-                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>);
+                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>,
+                  LogSoftmaxInferShapeFunctor);
 REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    log_softmax,
-    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    log_softmax_grad,
-    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu
deleted file mode 100644
index 26b6ce43303d181c41b60cf36c229d00acb0e626..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/log_softmax_op.cu
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/log_softmax_op.h"
-#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class LogSoftmaxKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<Tensor>("X");
-    auto *out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    int input_axis = ctx.Attr<int>("axis");
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    phi::SoftmaxForwardCUDAKernelDriver<T, true>(dev_ctx, *x, input_axis, out);
-  }
-};
-
-template <typename T>
-class LogSoftmaxGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *out = ctx.Input<Tensor>("Out");
-    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(ctx.GetPlace());
-
-    int input_axis = ctx.Attr<int>("axis");
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    phi::SoftmaxBackwardCUDAKernelDriver<T, true>(dev_ctx, *out, *dout,
-                                                  input_axis, dx);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#ifdef PADDLE_WITH_HIP
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::bfloat16>);
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
-#else
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, double>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::bfloat16>);
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, double>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
-#endif
diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h
deleted file mode 100644
index 162087a75662d711a63cbbe4beeaecf265367c6a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/log_softmax_op.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-static inline int CanonicalAxis(const int axis, const int rank) {
-  if (axis < 0) {
-    return axis + rank;
-  }
-  return axis;
-}
-
-static inline size_t SizeToAxis(const int axis, const framework::DDim dims) {
-  size_t size = 1;
-  for (int i = 0; i < axis; i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) {
-  size_t size = 1;
-  for (int i = axis; i < dims.size(); i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-template <typename T>
-struct ValueClip {
-  HOSTDEVICE T operator()(const T& x) const {
-    const T kThreshold = static_cast<T>(-64.);
-    return x < kThreshold ?
kThreshold : x;
-  }
-};
-
-template <typename DeviceContext, typename T>
-struct LogSoftmaxFunctor {
-  void operator()(const DeviceContext& context, const framework::Tensor* X,
-                  framework::Tensor* Y, const int axis) {
-    constexpr int kBatchDim = 0;
-    constexpr int kClassDim = 1;
-    constexpr int kAxisDim = 1;
-
-    int axis_dim = X->dims()[axis];
-    const int n = SizeToAxis(axis, X->dims());
-    const int d = SizeFromAxis(axis, X->dims());
-    framework::DDim dim_2d{n, d};
-
-    auto logits = EigenMatrix<T>::From(*X, dim_2d);
-    auto log_softmax = EigenMatrix<T>::From(*Y, dim_2d);
-
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
-    const int num_remain = num_classes / axis_dim;
-
-    Eigen::DSizes<int, 1> along_axis(kAxisDim);
-    Eigen::DSizes<int, 2> batch_classes(batch_size, num_classes);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-    Eigen::DSizes<int, 3> batch_one_remain(batch_size, 1, num_remain);
-    Eigen::DSizes<int, 3> one_axis_one(1, axis_dim, 1);
-    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
-    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
-
-    // For numerical stability, logits should be shifted by maximum number along
-    // axis, calculate shifted_logits into log_softmax tensor for memory reuse.
-    if (num_remain == 1) {
-      // axis == -1, axis and class in same dimension, calculate along
-      // class dimension directly for higher performance
-      log_softmax.device(*context.eigen_device()) =
-          (logits -
-           logits.maximum(along_axis)
-               .eval()
-               .reshape(batch_by_one)
-               .broadcast(one_by_class))
-              .unaryExpr(ValueClip<T>());
-    } else {
-      // axis != -1, class dimension split into (axis, remain), max and sum
-      // should be calculated along axis dimension
-      log_softmax.device(*context.eigen_device()) =
-          (logits.reshape(batch_axis_remain) -
-           logits.reshape(batch_axis_remain)
-               .maximum(along_axis)
-               .eval()
-               .reshape(batch_one_remain)
-               .broadcast(one_axis_one)
-               .reshape(batch_classes))
-              .unaryExpr(ValueClip<T>());
-    }
-
-    log_softmax.device(*context.eigen_device()) =
-        log_softmax -
-        log_softmax.exp()
-            .eval()
-            .reshape(batch_axis_remain)
-            .sum(along_axis)
-            .log()
-            .broadcast(one_axis);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LogSoftmaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Out = context.Output<framework::Tensor>("Out");
-    const int rank = X->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-
-    // allocate memory on device.
-    Out->mutable_data<T>(context.GetPlace());
-
-    if (X->numel() != 0) {
-      LogSoftmaxFunctor<DeviceContext, T>()(
-          context.template device_context<DeviceContext>(), X, Out, axis);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-struct LogSoftmaxGradFunctor {
-  void operator()(const DeviceContext& context, const framework::Tensor* Y,
-                  const framework::Tensor* dY, framework::Tensor* dX,
-                  const int axis) {
-    constexpr int kBatchDim = 0;
-    constexpr int kClassDim = 1;
-
-    const int n = SizeToAxis(axis, Y->dims());
-    const int d = SizeFromAxis(axis, Y->dims());
-    framework::DDim dim_2d{n, d};
-
-    auto y = EigenMatrix<T>::From(*Y, dim_2d);
-    auto dy = EigenMatrix<T>::From(*dY, dim_2d);
-    auto dx = EigenMatrix<T>::From(*dX, dim_2d);
-
-    const int axis_dim = Y->dims()[axis];
-    const int batch_size = y.dimension(kBatchDim);
-    const int num_classes = y.dimension(kClassDim);
-    const int num_remain = num_classes / axis_dim;
-
-    Eigen::DSizes<int, 1> along_class(kClassDim);
-    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
-    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
-
-    dx.device(*context.eigen_device()) =
-        dy -
-        (y.exp()) * (dy.reshape(batch_axis_remain)
-                         .sum(along_class)
-                         .broadcast(one_axis));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LogSoftmaxGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* Out = context.Input<framework::Tensor>("Out");
-    auto* dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    const int rank = Out->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-
-    // allocate memory on device.
-    dX->mutable_data<T>(context.GetPlace());
-
-    if (Out->numel() != 0) {
-      LogSoftmaxGradFunctor<DeviceContext, T>()(
-          context.template device_context<DeviceContext>(), Out, dOut, dX,
-          axis);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
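A minimal stand-in for the deleted LogSoftmaxFunctor math (the axis = -1 case): shift by the row maximum for numerical stability, then subtract log(sum(exp(shifted))). Plain C++, illustrative only:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    std::vector<double> log_softmax(const std::vector<double>& x) {
      double mx = x[0];
      for (double v : x) mx = std::max(mx, v);
      double sum = 0.0;
      for (double v : x) sum += std::exp(v - mx);  // cannot overflow after shift
      const double log_sum = std::log(sum);
      std::vector<double> out;
      for (double v : x) out.push_back(v - mx - log_sum);
      return out;
    }

    int main() {
      for (double v : log_softmax({1.0, 2.0, 3.0})) std::printf("%.6f ", v);
      // prints roughly: -2.407606 -1.407606 -0.407606
    }

The ValueClip functor above adds one more safeguard the sketch omits: shifted logits are floored at -64 so that exp() of a very negative value cannot underflow to a hard zero before the log.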
dX->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index a6eb535c693b8422a7b066618cbfddeddd751387..1887bbcfb7efdcf43e0cc020d773268312523505 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,16 +23,6 @@ class MaskedSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - - // output will only be a 1-D Tensor - ctx->SetOutputDim("Y", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Y"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,8 +92,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(masked_select, MaskedSelectInferShapeFunctor, + PD_INFER_META(phi::MaskedSelectInferMeta)); + REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker, ops::MaskedSelectGradOpMaker, - ops::MaskedSelectGradOpMaker); + ops::MaskedSelectGradOpMaker, + MaskedSelectInferShapeFunctor); REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad, ops::MaskedSelectedGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 5d394424d54f5291df2855041d5d7f943dbd43d0..51daccce0e8822a1eec25ac428e5a56c632805e2 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -57,21 +59,7 @@ where, $\sum {x^2}$ is calculated along the `axis` dimension.
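The masked_select change above is the pattern this whole patch follows: the fluid InferShape override is deleted and shape inference moves to a phi InferMeta function, wired in at registration time through DECLARE_INFER_SHAPE_FUNCTOR and PD_INFER_META. A hedged sketch of what such a meta function looks like for masked_select (the name and the MetaTensor accessors are assumptions based on phi's interface; the real MaskedSelectInferMeta lives in paddle/phi/infermeta/binary.cc):

// Sketch only: assumes phi::MetaTensor exposes dtype()/set_dims()/
// set_dtype()/share_lod(), as PD_INFER_META-wrapped functions use.
void MaskedSelectInferMetaSketch(const phi::MetaTensor& x,
                                 const phi::MetaTensor& mask,
                                 phi::MetaTensor* out) {
  // How many elements survive the mask is data-dependent, so the static
  // shape is the unknown 1-D shape {-1}, matching the deleted InferShape.
  out->set_dims(phi::make_ddim({-1}));
  out->set_dtype(x.dtype());
  out->share_lod(x);
}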
}; class NormOp : public framework::OperatorWithKernel { - public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NormOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NormOp"); - auto xdim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", xdim); - - if (ctx->Attrs().Get("is_test") == false) { - int axis = ctx->Attrs().Get("axis"); - if (axis < 0) axis = xdim.size() + axis; - xdim[axis] = 1; - ctx->SetOutputDim("Norm", xdim); - } - } }; class NormOpGrad : public framework::OperatorWithKernel { @@ -111,7 +99,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, + PD_INFER_META(phi::NormInferMeta)); + REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, ops::NormOpGradOpMaker, - ops::NormOpGradOpMaker); + ops::NormOpGradOpMaker, + NormInferShapeFunctor); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc index d0cb674b4049f988773accd4b0652d62a1be2287..adc4a2ffaf8c54d32c10fa47e27d86aef2f9c508 100644 --- a/paddle/fluid/operators/pad_op_npu.cc +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -90,5 +90,5 @@ namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel, ops::PadNPUKernel, ops::PadNPUKernel); -REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel, +REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadGradNPUKernel, ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 4d2a2e23b3f70dc48029be2e0a79c9695881b519..de35f67405810180554bfd556f91b7501f9c4ba2 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -9,14 +9,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prelu_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + framework::OpKernelType innerGetKernelTypeForVar( const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { #ifdef PADDLE_WITH_MKLDNN @@ -44,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu"); - OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu"); - - auto x_dim = ctx->GetInputDim("X"); - std::string mode = ctx->Attrs().Get("mode"); - if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1, - platform::errors::InvalidArgument( - "For mode 'all', size of weight Alpha must be one. 
" - "But recevied alpha's size: %d.", - product(ctx->GetInputDim("Alpha")))); - } else if (mode == "channel") { - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 2, - platform::errors::InvalidArgument( - "For mode 'channel', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - const std::string data_format_str = - ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC", - true, - platform::errors::InvalidArgument( - "For mode 'channel', data_format must be one of " - "NCHW and NHWC. But recevied data_format: %s", - data_format_str)); - if (data_format_str == "NCHW" || ctx->IsRunMKLDNNKernel()) { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[1]: %d", - product(ctx->GetInputDim("Alpha")), x_dim[1])); - } else { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[%d]: %d", - product(ctx->GetInputDim("Alpha")), x_rank - 1, - x_dim[x_rank - 1])); - } - - } else if (mode == "element") { - auto alpha_dim = ctx->GetInputDim("Alpha"); - auto alpha_rank = alpha_dim.size(); - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 1, - platform::errors::InvalidArgument( - "For mode 'element', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - PADDLE_ENFORCE_EQ( - alpha_rank, x_rank, - platform::errors::InvalidArgument( - "For mode 'element', rank of weight Alpha must be ", - "equal to the rank of input(x). But recevied alpha's rank: %d, " - "x's rank: %d.", - alpha_rank, x_rank)); - size_t x_product = 1; - size_t alpha_product = 1; - for (int64_t i = x_rank - 1; i > 0; i--) { - x_product *= x_dim[i]; - alpha_product *= alpha_dim[i]; - } - PADDLE_ENFORCE_EQ( - alpha_product, x_product, - platform::errors::InvalidArgument( - "For mode 'element', the size of weight Alpha must be " - "equal to the size of input(x). But recevied alpha's size: %d, " - "x's size: %d.", - alpha_product, x_product)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. 
" - "But recevied " - "mode: '%s'.", - mode)); - } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -268,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor, + PD_INFER_META(phi::PReluInferMeta)); REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, ops::PReluGradOpMaker, - ops::PReluGradOpMaker); + ops::PReluGradOpMaker, + PReluInferShapeFunctor); REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL( - prelu, ops::PReluKernel, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL( - prelu_grad, ops::PReluGradKernel, - ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu deleted file mode 100644 index 12e55d042d7037606179cc06480e4f80f942d8a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.cu +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define CUDA_NUM_THREADS 1024 - -inline static int PADDLE_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -class CUDAPReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - - VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; - - if (mode == "channel") { - bool channel_last = data_format == "NHWC"; - size_t channel = channel_last ? 
dim[x_rank - 1] : dim[1]; - math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], channel, channel_last, - numel); - } else if (mode == "element") { - math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], numel); - } else { - math::PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, - o_ptr, numel); - } - } -}; - -enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar }; - -template -__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr, - const T* dy_ptr, T* dx_ptr, T* dalpha_ptr, - size_t channel_num, size_t plane_size, - size_t spatial_size, size_t numel, - PRELU_MODE mode) { - CUDA_KERNEL_LOOP(index, numel) { - T scale; - if (mode == Element) { - size_t element_index = index % spatial_size; - scale = alpha_ptr[element_index]; - } else if (mode == ChannelFirst) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - scale = alpha_ptr[channel_index]; - } else if (mode == ChannelLast) { - size_t channel_index = index % channel_num; - scale = alpha_ptr[channel_index]; - } else { - scale = alpha_ptr[0]; - } - T x = x_ptr[index]; - T dy = dy_ptr[index]; - T zero = static_cast(0); - if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy; - if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy; - } -} - -template -class PreluOpGradFunctor { - public: - void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy, - T* dx, T* dalpha, const framework::DDim& input_dims, - PRELU_MODE mode) { - size_t numel = 1; - for (size_t i = 0; i < input_dims.size(); ++i) { - numel *= input_dims[i]; - } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1]; - - PReluOpGradKernel< - T><<>>( - x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel, - mode); - } -}; - -template -class CUDAPReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - - const T* x_ptr = x->data(); - const T* alpha_ptr = alpha->data(); - const T* dy_ptr = dy->data(); - T* dx_ptr = dx ? dx->mutable_data(context.GetPlace()) : nullptr; - T* dalpha_ptr = - dalpha ? 
dalpha->mutable_data(context.GetPlace()) : nullptr; - - if (!dx && !dalpha) return; - - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); - auto stream = context.cuda_device_context().stream(); - - T* dalpha_tmp_ptr; - Tensor dalpha_tmp; - if (dalpha_ptr == nullptr) { - dalpha_tmp_ptr = dalpha_ptr; - } else { - auto& dev_ctx = context.template device_context(); - dalpha_tmp = context.AllocateTmpTensor(dim, dev_ctx); - dalpha_tmp_ptr = dalpha_tmp.mutable_data(context.GetPlace()); - } - - PRELU_MODE m; - bool channel_last = false; - if (mode == "element") { - m = Element; - } else if (mode == "channel") { - channel_last = data_format == "NHWC"; - m = channel_last ? ChannelLast : ChannelFirst; - } else { - m = Scalar; - } - PreluOpGradFunctor prelu_grad; - prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim, - m); - - if (dalpha_tmp_ptr == nullptr) return; - - std::vector reduce_dims; - for (size_t i = 0; i < dim.size(); i++) { - if (mode == "channel" && !channel_last && i == 1) continue; - if (mode == "channel" && channel_last && i == dim.size() - 1) continue; - if (mode == "element" && i != 0) continue; - reduce_dims.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), dalpha_tmp, dalpha, - kps::IdentityFunctor(), reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - prelu, ops::CUDAPReluKernel, - ops::CUDAPReluKernel, - ops::CUDAPReluKernel); -REGISTER_OP_CUDA_KERNEL( - prelu_grad, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h deleted file mode 100644 index 384994eb37c2a955c383ddeebafe5f0e64d3c961..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.h +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
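Every PReLU kernel deleted in this patch computes the same elementwise rule, out = x > 0 ? x : alpha * x; the modes differ only in which alpha each element reads ('all': one shared scalar, 'channel': one per channel, 'element': one per position within a sample). A compact CPU sketch of the three indexing schemes for NCHW input (illustrative; the functionality now lives in the phi prelu kernels):

#include <cstddef>
#include <string>
#include <vector>

// PReLU forward over a flattened NCHW tensor. `inner` is H*W and `sample`
// is C*H*W; they determine which alpha an element reads in each mode.
void PReluForward(const std::vector<float>& x, const std::vector<float>& alpha,
                  const std::string& mode, size_t channels, size_t inner,
                  size_t sample, std::vector<float>* out) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    size_t a = 0;  // mode == "all": one shared alpha
    if (mode == "channel") a = (i / inner) % channels;
    else if (mode == "element") a = i % sample;
    (*out)[i] = x[i] > 0.f ? x[i] : alpha[a] * x[i];
  }
}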
*/ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::Transform; - -template -class PReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } - } -}; - -template -class PReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - auto* alpha = context.Input("Alpha"); - const T* alpha_ptr = alpha->data(); - const T* x_ptr = x->data(); - const T* dout_ptr = dout->data(); - std::string mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (dx) { - T* dx_ptr = dx->mutable_data(context.GetPlace()); - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dx_ptr[i] = x_ptr[i] > 0 ? 
dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; - } - } - } - - index = 0; - if (dalpha) { - T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); - memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); - - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } - - // TODO(Guanzhong): add GPU kernels - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index 41df8e4a15f093a40a31c70eea98dfb7e575f4cd..15812778e0023e30a29f259bbd14b4c564ea8d46 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMaxInferShapeFunctor); REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu deleted file mode 100644 index 5ee38b8fa46290c86cd44ef1bcc71bd2fcd9bcd4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
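The reduce_max_grad registrations removed here all dispatch one rule: the gradient flows only to elements that attained the maximum, dx_i = dy * [x_i == max(x)] (expressed in the old path through the shared max-or-min grad functor in reduce_min_max_op.h; the kernels now live in phi). A one-dimensional sketch:

#include <algorithm>
#include <vector>

// Backward of y = max(x) over a 1-D slice: route dy to the max positions.
// Like the old functor, every tied maximum receives the full gradient.
std::vector<float> ReduceMaxGrad(const std::vector<float>& x, float dy) {
  const float mx = *std::max_element(x.begin(), x.end());
  std::vector<float> dx(x.size(), 0.f);
  for (size_t i = 0; i < x.size(); ++i)
    if (x[i] == mx) dx[i] = dy;
  return dx;
}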
- -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 4a18330913803f822436118a35fb957b7e31b391..dc41979defb9314f2efb942f0f530c3b5da3bb8b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); - -template -using CPUReduceMeanGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index b9915f2b484f140bfd776b64459a19c6788a55c9..5e5b04d57b002d8e8ecab9ddaf8186118f4bf187 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMinInferShapeFunctor); REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu deleted file mode 100644 index bf886063786a8c36884ed20fef41c99468156c01..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index eb745ab9c56c5b3cfa62eb36713ebc2485282d6d..b1abdf9e8a758008dff49176c2d6b6682de5b622 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -26,14 +30,20 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle -REGISTER_REDUCE_OP(reduce_prod); +namespace ops = paddle::operators; + +class ReduceProdOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_prod"; } + virtual std::string GetOpType() const { return "Reduce reduce_prod"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_OP_CPU_KERNEL(reduce_prod_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OPERATOR( + reduce_prod, ops::ReduceOp, ReduceProdOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceProdInferShapeFunctor); +REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu deleted file mode 100644 index 0610cdd94f89c0371988fac7955d07fc5498a69f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 9fd66590cb7298d62a4720ff3a8276eca49df884..12e33d56c0020858ba44709572ee8e526bc949df 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/kernels/roi_pool_kernel.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -27,74 +29,6 @@ class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "roi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Argmax"), "Output", "Argmax", "roi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The input data should be a four-dimensional " - "tensor with [N,C,H,W], but received input data with " - " %d dimension", - input_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...], but received ROIs is " - "%d-dimensional LoDTensor", - rois_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims[1], phi::kROISize, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...]. 
But the second dimension of " - "the received data is %d", - rois_dims[1])); - - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::OutOfRange( - "The pooled output height must be greater than 0" - "but received height is %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::OutOfRange( - "The pooled output width must be greater than 0" - "but received width is %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::OutOfRange( - "The spatial scale must be greater than 0, " - "but received spatial scale is %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("Argmax", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -213,9 +147,13 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_pool, RoiPoolInferShapeFunctor, + PD_INFER_META(phi::RoiPoolInferMeta)); + REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, - ops::ROIPoolGradMaker); + ops::ROIPoolGradMaker, + RoiPoolInferShapeFunctor); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_VERSION(roi_pool) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 19a395e72314db52d52cf704a567dce8dd58318a..41545a1ca20b267e79f43c2af4c58ea64dd479b2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -760,8 +760,9 @@ static void SoftmaxWithCrossEntropyHardLabel( */ template __global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const LabelT* labels, const int64_t n, - const int64_t dim, const int64_t d, const int ignore_index) { + T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels, + const int64_t n, const int64_t dim, const int64_t d, + const int ignore_index) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = idx / (d * dim); int64_t idx_dim = (idx / d) % dim; @@ -773,10 +774,9 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel( if (lbl == ignore_index) { logits_grad[idx] = static_cast(0.0); } else if (lbl == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; + logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logits_grad[idx] *= loss_grad[ids]; + logits_grad[idx] = softmax[idx] * loss_grad[ids]; } } } @@ -1395,11 +1395,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - if (logit_grad != softmax) { + auto stream = context.cuda_device_context().stream(); + auto ignore_index = context.Attr("ignore_index"); + auto use_softmax = context.Attr("use_softmax"); + + T* logit_grad_data = nullptr; + bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label)); + if (copy_flag) { framework::TensorCopy(*softmax, context.GetPlace(), 
context.device_context(), logit_grad); + logit_grad_data = logit_grad->template data(); + } else { + logit_grad_data = + logit_grad->template mutable_data(context.GetPlace()); } - T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); @@ -1414,9 +1423,6 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { #else int block = 512; #endif - auto stream = context.cuda_device_context().stream(); - auto ignore_index = context.Attr("ignore_index"); - auto use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax if (!use_softmax) { @@ -1451,11 +1457,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + const T* softmax_data = softmax->template data(); const auto* label_data = labels.template data(); int grid = (n * d + block - 1) / block; SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + logit_grad_data, loss_grad_data, softmax_data, label_data, n, + d / remain, remain, ignore_index); } } }; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc index d198992abde7dc79f0732928a3cb0cb0e6549ded..0c178b02d03099c6e9df4c3eae5ce95352982d7e 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cc +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -50,6 +50,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::SyncBatchNormGradMaker, diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 1df917b8c3594d4505d9e92cd9a8c64bffd50279..e89d8d96342e723724bb867a14bc4262c6ab7b16 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -235,25 +235,13 @@ void BindDistributed(py::module *m) { py::call_guard()); #if defined(PADDLE_WITH_GLOO) - py::class_(*m, "GlooOptions") - .def(py::init<>()) - .def_readwrite("_device", &GlooOptions::device) - .def_static("create", &GlooOptions::create); - - py::class_>(*m, "GlooStore") - .def(py::init( - [](const std::shared_ptr &store) { - return std::make_shared(store); - }), - py::call_guard()); - py::class_>( *m, "ProcessGroupGloo", ProcessGroup) - .def(py::init &, int, int, - std::shared_ptr &>(), + .def(py::init &, int, + int, std::shared_ptr &>(), py::call_guard()) - .def(py::init([](const std::shared_ptr &store, int rank, - int world_size) { + .def(py::init([](const std::shared_ptr &store, + int rank, int world_size) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); if (ifname && strlen(ifname) > 1) { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index c9e80c7b4b407456fc962f508ae441a9c07914b2..528bd75eb0013b95057d7549e083b2fa1318cac1 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -375,6 +375,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ins_auto_grad_metas.resize(ctx.InputRange().size()); VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); 
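The softmax_with_cross_entropy grad change earlier in this patch decouples the gradient buffer from the softmax buffer: instead of always copying Softmax into Logits@GRAD and updating it in place, the kernel now reads softmax through its own pointer, so the TensorCopy is only needed when the two tensors may alias. For hard labels the rule it evaluates is dlogit_c = (softmax_c - 1[c == label]) * dloss, with ignored labels producing zero. A scalar CPU sketch of that rule for one sample:

#include <cstdint>
#include <vector>

// Hard-label softmax-cross-entropy backward for one sample with `dim`
// classes; mirrors SoftmaxWithCrossEntropyGradHardLabel minus CUDA indexing.
void HardLabelGrad(const std::vector<float>& softmax, float loss_grad,
                   int64_t label, int64_t ignore_index,
                   std::vector<float>* logits_grad) {
  const int64_t dim = static_cast<int64_t>(softmax.size());
  logits_grad->assign(dim, 0.f);
  if (label == ignore_index) return;  // ignored sample: zero gradient
  for (int64_t c = 0; c < dim; ++c) {
    const float one_hot = (c == label) ? 1.f : 0.f;
    (*logits_grad)[c] = (softmax[c] - one_hot) * loss_grad;
  }
}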
outs_auto_grad_metas.resize(ctx.OutputRange().size()); + for (size_t i = 0; i < ctx.InputRange().size(); i++) { ins_auto_grad_metas[i] = egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( @@ -404,11 +405,15 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, // Prepare Grad outputs size_t no_grad_cnt = 0; for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); + if (slot_map[0].find(i) != slot_map[0].end()) { - grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], slot_map[0][i]); + grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); } else { - grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], + grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); grad_node->AddEdges(&ins_auto_grad_metas[i], ins_auto_grad_metas.size() - 1 - no_grad_cnt); @@ -417,11 +422,14 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with grad of fwd outputs for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); - grad_node->SetGradInMeta(&(outs_auto_grad_metas[i]), i); - egr::EagerUtils::CheckAndRetainGrad(ctx.OutputsBetweeen( - ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); } // Prepare Grad inputs with fwd outputs diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 7f8fcd351fe2a0a9712560f913a83f2cc3580395..e0a3931c3e3d369dcd5798298414dafc8a87e290 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -327,23 +327,25 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, grad = meta->MutableGrad(); } - if (grad->is_selected_rows()) { - auto selected_rows = - std::dynamic_pointer_cast(grad->impl()); - if (selected_rows->mutable_value()->IsInitialized()) { - selected_rows->mutable_rows()->clear(); - selected_rows->mutable_value()->clear(); - } - } else if (grad->is_dense_tensor()) { - if (grad->initialized()) { - if (set_to_zero) { - grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); - } else { - VLOG(4) << "Gradient of " << self->tensor.name() - << " is initialized, will be released."; - auto dense_tensor = - std::dynamic_pointer_cast(grad->impl()); - dense_tensor->MoveMemoryHolder(); + if (grad->impl()) { + if (grad->is_selected_rows()) { + auto selected_rows = + std::dynamic_pointer_cast(grad->impl()); + if (selected_rows->mutable_value()->IsInitialized()) { + selected_rows->mutable_rows()->clear(); + selected_rows->mutable_value()->clear(); + } + } else if (grad->is_dense_tensor()) { + if (grad->initialized()) { + if (set_to_zero) { + grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + } else { + VLOG(4) << "Gradient of " << self->tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad->impl()); + dense_tensor->MoveMemoryHolder(); + } } } } @@ -716,6 +718,15 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* 
tensor__inplace_version(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + uint32_t inplace_version = self->tensor.current_inplace_version(); + + return ToPyObject(inplace_version); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -764,6 +775,8 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 102cdbb91ab066c4a6d499688bca30c1c3d185ad..685e20aef2591492340d228f0a48d7a426ddb889 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -162,17 +162,22 @@ static inline std::string TempName(const std::string& name) { std::string GenerateOpFunctionsBody( const paddle::framework::proto::OpProto* op_proto, std::string func_name, - bool use_inplace_strategy = false, std::map inplace_map = {}) { auto& op_type = op_proto->type(); std::string input_args = ""; - std::string call_api_str = "auto out = " + op_type + "_dygraph_function("; + std::string call_api_str = ""; std::string ins_initializer_with_null = ""; std::string py_arg = ""; int arg_idx = 0; int input_args_num = 0; std::string ins_cast_str = ""; std::string view_strategy_str = ""; + if (!inplace_map.empty()) { + // change call_api_str for inplace op + call_api_str = "auto out = " + op_type + "__dygraph_function("; + } else { + call_api_str = "auto out = " + op_type + "_dygraph_function("; + } for (auto& input : op_proto->inputs()) { auto& in_name = input.name(); // skip those dispensable inputs, like ResidualData in conv2d @@ -288,8 +293,31 @@ std::string GenerateOpFunctionsBody( HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, viwe_input_name, viwe_output_name); } - - return_str = "return ToPyObject(out);"; + if (!inplace_map.empty()) { + // For inplace op, Use the input PyObject directly. + for (auto& inplace_pair : inplace_map) { + // Find index of inplace tensor, and directly use input PyObject. + std::string inplace_arg_name = inplace_pair.second; + std::string inplace_return_name = inplace_pair.first; + const char* RETURN_INPLACE_TENSOR_TEMPLATE = + "ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, " + "\"%s\", \"%s\");\n" + " ssize_t return_id = " + "GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n" + " return ToPyObject(out, return_id, args, arg_id);"; + return_str = paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE, + op_type, inplace_arg_name, op_type, + inplace_return_name); + // only support one inplace_var in temporary. 
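The inplace return path generated above hands the caller back its own tensor object instead of allocating a new one. Because PyTuple_GET_ITEM yields a borrowed reference, the generated code must take a fresh reference before returning it, which is what the ToPyObject overload added later in this patch does. A standalone illustration of that refcount contract (plain CPython C API):

#include <Python.h>

// Return args[arg_idx] as the call's result. PyTuple_GET_ITEM returns a
// borrowed reference, so INCREF it before transferring ownership out.
static PyObject* ReturnInplaceArg(PyObject* args, Py_ssize_t arg_idx) {
  PyObject* obj = PyTuple_GET_ITEM(args, arg_idx);  // borrowed
  Py_INCREF(obj);                                   // now owned by the caller
  return obj;
}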
+ PADDLE_ENFORCE_EQ( + inplace_map.size(), 1, + paddle::platform::errors::InvalidArgument( + "size of inplace_map must be 1, but got %d", inplace_map.size())); + break; + } + } else { + return_str = "return ToPyObject(out);"; + } std::string function_args = ""; if (input_args == "") { @@ -383,7 +411,8 @@ GenerateOpFunctions() { continue; } std::string func_name = "eager_api_" + op_type; - std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name); + std::string op_function_str = + GenerateOpFunctionsBody(op_proto, func_name, {}); // generate pybind item auto bind_function_str = paddle::string::Sprintf( @@ -391,6 +420,40 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(op_function_str)); bind_function_list.emplace_back(std::move(bind_function_str)); + + // NOTE(pangyoki): Inplace Strategy. + // In this case, output will reuse input varbase. + // Dygraph mode needs to be aligned with the in-place strategy in static + // mode, and the mapping relationships between output and input that have + // been defined in static mode should be used in dygraph mode. + // Find which ops need to use Inplace strategy in static mode, and get the + // mapping relationship between Inplace output and input. + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + // Inplace OP: op_type_. + // The inplace OP needs a new implementation method. + auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + std::string inplace_op_type = op_type + "_"; + std::string inplace_func_name = "eager_api_" + inplace_op_type; + std::string inplace_op_function_str = + GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map); + + // generate pybind item + auto inplace_bind_function_str = + paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type, + inplace_func_name, inplace_op_type); + + op_function_list.emplace_back(std::move(inplace_op_function_str)); + bind_function_list.emplace_back(std::move(inplace_bind_function_str)); + } } if (append_custom_head_file) { op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index ecf75da080788fb4377923d4d34088f63ebd4969..bd219eb0df495e046562ce466a75bc24287ed066 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -417,6 +417,8 @@ PyObject* ToPyObject(bool value) { PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } +PyObject* ToPyObject(uint32_t value) { return PyLong_FromUnsignedLong(value); } + PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); } @@ -442,6 +444,20 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value) { return obj; } +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // value: Useless parameter. + // value_idx: Useless parameter. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. 
+ PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + Py_INCREF(obj); + return obj; +} + PyObject* ToPyObject(const std::vector& value) { PyObject* result = PyList_New((Py_ssize_t)value.size()); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 1c4e2ab69a5ecba1209a11651c3c11972dff565c..fba1485bcf44ea70db286225fbbe3c70caceb4bd 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -56,6 +56,7 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); PyObject* ToPyObject(int value); +PyObject* ToPyObject(uint32_t value); PyObject* ToPyObject(bool value); PyObject* ToPyObject(int64_t value); PyObject* ToPyObject(float value); @@ -63,6 +64,8 @@ PyObject* ToPyObject(double value); PyObject* ToPyObject(const char* value); PyObject* ToPyObject(const std::string& value); PyObject* ToPyObject(const paddle::experimental::Tensor& value); +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); @@ -84,6 +87,17 @@ struct TupleTensorResult { TupleTensorResult::Run(out, result); PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + TupleTensorResult::Run(out, result, value_idx, args, arg_idx); + if (N - 1 == value_idx) { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out), + value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); + } + } }; template @@ -91,6 +105,16 @@ struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + if (value_idx == 0) { + PyTuple_SET_ITEM(result, 0, + ToPyObject(std::get<0>(out), value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); + } + } }; template @@ -103,6 +127,26 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +template +PyObject* ToPyObject(const std::tuple& out, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // out: Outputs tuple after executing op. + // value_idx: Index of inplace tensor in outputs tuple. Used to find the + // output inplace tensor. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. 
+ auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleTensorResult::Run(out, result, value_idx, + args, arg_idx); + + return result; +} + paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 85427a8455b54ddb8dcfd453ba3d16684729a9d5..3a2c93309f34454ae0ce2d3419e3fce474f7c06b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -119,7 +119,11 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { return var; } - return PyObjectCast>(res)->SharedVar(); + auto res_varbase = PyObjectCast>(res); + // Here the reference count of `res` is 2, so we decreases the reference + // count manually to avoid memory leaks + Py_DECREF(res); + return res_varbase->SharedVar(); } private: diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8d78adaf5a4735d87e2206df6c8b55875db68118..1520174fba288b4ecf683e79c36b6c0228237b2e 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -46,10 +46,19 @@ int main(int argc, char **argv) { auto &kernel_factory = phi::KernelFactory::Instance(); std::string kernel_signature_map_str{"{"}; for (const auto &op_kernel_pair : kernel_factory.kernels()) { - if (kernel_signature_map.Has(op_kernel_pair.first)) { + std::string op_name = op_kernel_pair.first; + const paddle::flat_hash_map &kernel_name_map = + phi::OpUtilsMap::Instance().base_kernel_name_map(); + for (auto &it : kernel_name_map) { + if (it.second == op_name) { + op_name = it.first; + break; + } + } + if (kernel_signature_map.Has(op_name)) { kernel_signature_map_str = kernel_signature_map_str + "\"" + op_kernel_pair.first + "\":{"; - auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + auto &args = kernel_signature_map.Get(op_name).args; kernel_signature_map_str += "\"inputs\":["; auto inputs_ = std::get<0>(args); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 09c3cea398b2aec4d7cf0953ffb0aed75de37601..1d483abd7746c104c3f1dcf318f45850e4fcb855 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -854,5 +854,30 @@ void InitOpsAttrTypeMap() { } } +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name) { + // `core_ops_info_map` can be `core_ops_args_info` or `core_ops_returns_info`. + // `core_ops_args_info`: get index from core_ops_args_info[op_type] according + // to input name. + // `core_ops_returns_info`: get index from core_ops_returns_info[op_type] + // according to return name. 
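A hedged usage sketch of the lookup helper documented above: given a hypothetical core_ops_args_info entry, the helper maps an argument name to its positional index (the table contents below are illustrative, not taken from the real generated tables):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Minimal stand-in for GetIdxFromCoreOpsInfoMap's core lookup logic.
static std::ptrdiff_t IdxOf(
    const std::unordered_map<std::string, std::vector<std::string>>& info,
    const std::string& op, const std::string& name) {
  const auto& args = info.at(op);  // the real helper raises a Fatal error here
  auto it = std::find(args.begin(), args.end(), name);
  return it == args.end() ? -1 : it - args.begin();
}

int main() {
  // Hypothetical table: a "relu" op with a single input named "X".
  std::unordered_map<std::string, std::vector<std::string>> core_ops_args_info{
      {"relu", {"X"}}};
  assert(IdxOf(core_ops_args_info, "relu", "X") == 0);
  return 0;
}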
+ if (!core_ops_info_map.count(op_type)) { + PADDLE_THROW(platform::errors::Fatal( + "Op %s is not found in core_ops_*_info map.", op_type)); + } else { + auto args_list = core_ops_info_map.at(op_type); + auto it = std::find(args_list.begin(), args_list.end(), name); + if (it == args_list.end()) { + PADDLE_THROW(platform::errors::Fatal("%s is not found in op %s's args.", + name, op_type)); + } else { + return std::distance(args_list.begin(), it); + } + } + return -1; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 7ead9852667252d189b1fcdecc6b4ac7b86d785f..33d0e242a027d250904a21ca36a39b6a639178e1 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -146,5 +146,10 @@ unsigned long GetUnsignedLongFromArgs( // NOLINT void InitOpsAttrTypeMap(); +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name); + } // namespace pybind } // namespace paddle diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 0ab64dd51c88758d043fb9105ffbf0d109e44cc0..89dd3b0dc7abf48102b48f16fb974b3c902fe049 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -82,9 +82,176 @@ TrtUniquePtr ConstructNetwork( return network; } +TrtUniquePtr ConstructFCNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 7840; + std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i % 255 * 0.02f; + } + kernel_weights.values = weight_data.data(); + auto* layer = network->addFullyConnected( + *data, 10, kernel_weights, nvinfer1::Weights{}); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + +TrtUniquePtr ConstructConvNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights, bias_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + bias_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 81; + bias_weights.count = 3; + std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i * 0.02f; + } + std::vector bias_data(bias_weights.count); + for (size_t i = 0; i < bias_data.size(); ++i) { + bias_data[i] = i * 0.5f; + } + kernel_weights.values = weight_data.data(); + bias_weights.values = 
bias_data.data(); + nvinfer1::Dims ksize; + ksize.nbDims = 2; + ksize.d[0] = 3; + ksize.d[1] = 3; + auto* layer = + network->addConvolutionNd(*data, 3, ksize, kernel_weights, bias_weights); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + // sigmoid(x) = 1 / (1 + exp(-x)) inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } +TEST(trt, run_fc_static) { + TrtEngine engine(0); + auto net = ConstructFCNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{1, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 1, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 1 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + cudaStreamSynchronize(context.stream()); +} + +TEST(trt, run_conv_static) { + TrtEngine engine(0); + auto net = ConstructConvNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + cudaStreamSynchronize(context.stream()); +} + TEST(trt, run_static) { TrtEngine static_trt_engine(0); auto net = ConstructNetwork( diff --git a/paddle/infrt/dialect/infrt/common/types.cc b/paddle/infrt/dialect/infrt/common/types.cc index 62419a196288bb052a9f240ecc25f34c102a5b35..c10679b01342f03b35e816bf290f71790f541ee2 100644 --- 
a/paddle/infrt/dialect/infrt/common/types.cc +++ b/paddle/infrt/dialect/infrt/common/types.cc @@ -30,6 +30,8 @@ llvm::Optional GetLayoutType(llvm::StringRef key) { return LayoutType::NCHW; else if (key.equals_insensitive("NHWC")) return LayoutType::NHWC; + else if (key.equals_insensitive("ANY")) + return LayoutType::ANY; else return llvm::None; } @@ -39,6 +41,8 @@ llvm::Optional GetPrecisionType(llvm::StringRef key) { return PrecisionType::FLOAT32; else if (key.equals_insensitive("FP16")) return PrecisionType::FLOAT16; + else if (key.equals_insensitive("UNK")) + return PrecisionType::UNK; else return llvm::None; } @@ -67,6 +71,9 @@ llvm::StringRef GetString(LayoutType type) { case (LayoutType::NHWC): str = "NHWC"; break; + case (LayoutType::ANY): + str = "ANY"; + break; default: str = "Unsupported"; } @@ -82,6 +89,9 @@ llvm::StringRef GetString(PrecisionType type) { case (PrecisionType::FLOAT16): str = "FP16"; break; + case (PrecisionType::UNK): + str = "UNK"; + break; default: str = "Unsupported"; } diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 8966ca13c2be08f1c744a73b4beaf20b0a3c015c..f8d8f514749f802299600acac60b12de70a8d3fe 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -142,9 +142,6 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return infrt::DenseTensorListType::get(parser.getContext()); } - if (keyword == "dense_tensor_map") { - return DenseTensorMapType::get(parser.getContext()); - } // Todo: parse other type return mlir::Type(); } @@ -181,6 +178,7 @@ void InfrtDialect::printType(::mlir::Type type, if (type.isa()) { os << "tensor_list"; + return; } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 6183295cafb356e85c0fd8bf417c3fb18eb30787..56c375c72d2bbb24e1a279c6f160e0ea7a98bd83 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -33,13 +33,14 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT registry.insert(); } diff --git a/paddle/infrt/dialect/pd/ir/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td index 7cab0eca45a1e7f74115f906db10a77f2eb1023b..e28854a848023c1161c8cda24edb705f536b5698 100644 --- a/paddle/infrt/dialect/pd/ir/pd_op_base.td +++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td @@ -17,7 +17,7 @@ def Paddle_Dialect : Dialect { This dialect contains the PaddlePaddle operators. 
}]; let hasConstantMaterializer = 1; - let cppNamespace = "mlir::pd"; + let cppNamespace = "infrt::pd"; } class PD_Op traits = []> : @@ -25,7 +25,7 @@ class PD_Op traits = []> : class PD_PaddleAttr : - Attr()">, + Attr()">, "PaddlePaddle " # description # " attribute">; @@ -33,12 +33,12 @@ class PD_PaddleAttr : // PaddlePaddle type definitions //===----------------------------------------------------------------------===// -def PD_PDDialectType : Type()">, "PaddlePaddle type">; +def PD_PDDialectType : Type()">, "PaddlePaddle type">; class PD_PaddleType : - Type()">, + Type()">, "Paddle " # description # " type">, - BuildableType<"getType()">; + BuildableType<"getType()">; //===----------------------------------------------------------------------===// // Integer types diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc index d105aa07dd06a9a9c3aba870702b1e304a3a938a..b5ba48581ee62f4e77328ed9f91ad956632dbbb7 100644 --- a/paddle/infrt/dialect/pd/ir/pd_ops.cc +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -24,7 +24,7 @@ #define GET_OP_CLASSES #include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT -namespace mlir { +namespace infrt { namespace pd { void PaddleDialect::initialize() { addOperations< @@ -43,33 +43,34 @@ mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, return builder.create(loc, value); } -void ConstantOp::build(OpBuilder &builder, - OperationState &state, - Attribute value) { - if (auto elem_attr = value.dyn_cast()) { +void ConstantOp::build(mlir::OpBuilder &builder, + mlir::OperationState &state, + mlir::Attribute value) { + if (auto elem_attr = value.dyn_cast()) { return ConstantOp::build(builder, state, elem_attr); - } else if (value.isa()) { - ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); - state.addAttribute("value", DenseElementsAttr::get(type, value)); + } else if (value.isa()) { + mlir::ShapedType type = + mlir::RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", mlir::DenseElementsAttr::get(type, value)); state.addTypes(type); return; } llvm_unreachable("unsupported attribute type for building pd.constant"); } -LogicalResult ConstantOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { +mlir::LogicalResult ConstantOp::inferReturnTypes( + mlir::MLIRContext *context, + mlir::Optional location, + mlir::ValueRange operands, + mlir::DictionaryAttr attributes, + mlir::RegionRange regions, + llvm::SmallVectorImpl &inferredReturnTypes) { inferredReturnTypes.push_back(attributes.get("value").getType()); - return success(); + return mlir::success(); } mlir::OpFoldResult ConstantOp::fold( ::llvm::ArrayRef operands) { return value(); } } // namespace pd -} // namespace mlir +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index e22a2309cbe2d343fab4e6e918d3c5a3f98cbb4e..0878163a955af236c6a40f60850e9e5cad67b2aa 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -55,8 +55,8 @@ bool reverseDfs(std::vector source, // merge the first&second graph op to a new graph op. 
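+ // The merged op's operand list starts from the first op's operands and is extended with the second op's operands that are defined outside the pair.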
void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT - mlir::pd::GraphOp first, - mlir::pd::GraphOp second) { // compute inputs and outputs ::llvm::SmallVector inputs(first.getOperands()), outputs; for (mlir::Value input : second.getOperands()) { @@ -85,7 +85,7 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create(loc, return_types, inputs); + auto graph_op = builder.create(loc, return_types, inputs); mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), @@ -150,13 +150,13 @@ void TRTGraphFusePass::runOnFunction() { do { changed = false; for (auto &op : body) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null(user_op); + infrt::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. std::vector source_nodes; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index f81179e548fd5fb15850e9b8943bce440dc3091c..ade61bfc370f550cf85267b3088d697bf1bea997 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -21,18 +21,18 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass. void TRTGraphSplitPass::runOnFunction() { - std::vector worklist; + std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - mlir::pd::GraphOp graph_op = worklist.back(); + infrt::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); mlir::Block* body = graph_op.getBody(); auto return_op = body->getTerminator(); diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 1e6a3e1380555ea94b0d5de9d64cdc42a27e894e..19c6b13e971ec779ed178413ca08b42b23dc71d1 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -27,7 +27,7 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern { : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {} ::mlir::LogicalResult matchAndRewrite( ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override { - auto casted_op = ::llvm::dyn_cast(op); + auto casted_op = ::llvm::dyn_cast(op); ::mlir::Operation::operand_range inputs = casted_op.inputs(); auto ods_loc = rewriter.getFusedLoc(op->getLoc()); CreateEngineOp create_engine_op; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 2c6f08277c803fe744bbfe559f21a6b8b085b816..ef9ccc82678f4bf2e2b518bf346d25393b9e480c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++
b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -35,13 +35,13 @@ void TRTOpTellerPass::runOnFunction() { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); ::llvm::SmallVector tblgen_repl_values; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 31b28a38e7cfee4eb6da68302d482218d97f8350..803a11ed5b7e5ce46211a85471536c0300d42630 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -60,6 +60,39 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { let results = (outs DenseTensor:$output); } +def TRT_FullyConnectedOp : TRT_Op<"FullyConnected", [NoSideEffect]> { + let summary = "TensorRT IFullyConnectedLayer"; + let description = [{ + TensorRT IFullyConnectedLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num + ); + let results = (outs + DenseTensor:$output_tensor + ); +} + +def TRT_ConvolutionOp : TRT_Op<"Convolution", [NoSideEffect]> { + let summary = "TensorRT IConvolutionLayer"; + let description = [{ + TensorRT IConvolutionLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num, + I32ArrayAttr:$kernel_size + ); + let results = (outs + DenseTensor:$output_tensor + ); +} + def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { let summary = "TensorRT IElementWiseLayer"; let description = [{ diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 3d5cccb5c32694ff05d10811bbff0f068bd6bc51..bcd44540b336eee6d9a76fc14057e8454b9ae329 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -298,14 +298,21 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( // add a naive implement. 
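+ // Resolve each operand to a runtime Value: block arguments are looked up directly; other values are fetched via GetValue and, if missing, taken from the result of the operand's defining op.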
for (int i = 0, e = op->getNumOperands(); i < e; ++i) { auto operand = op->getOperand(i); + Value* arg_value{nullptr}; if (operand.isa()) { mlir::BlockArgument arg = operand.dyn_cast(); - Value* arg_value = GetValue(arg); - if (arg_value->is_type()) { - impl_->runtime->FeedInArgs( - std::make_pair(std::to_string(i), ValueRef(arg_value))); + arg_value = GetValue(arg); + } else { + arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); } } + if (arg_value->is_type()) { + impl_->runtime->FeedInArgs( + std::make_pair(std::to_string(i), ValueRef(arg_value))); + } } #else CHECK(false) << "should not reach here"; diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 48999a23ef34cd119081810fb4baac77f5fb123b..29328520212fd4d020afc28c1e48d2db604414bc 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -22,7 +22,7 @@ MLIRModelGenImpl::MLIRModelGenImpl() context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); - context_->getOrLoadDialect(); + context_->getOrLoadDialect(); context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } @@ -91,11 +91,15 @@ llvm::SmallVector MLIRModelGenImpl::GetModelInputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); + operandTypes.push_back(type_); } } @@ -117,11 +121,14 @@ llvm::SmallVector MLIRModelGenImpl::GetModelOutputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -168,15 +175,11 @@ void MLIRModelGenImpl::UpdateModelParams( auto name = builder_.getStringAttr(var_desc.name()); std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = - infrt::DenseTensorType::get(context_, - infrt::TargetType::CPU, - infrt::PrecisionType::FLOAT32, - infrt::LayoutType::NCHW); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get( + context_, infrt::TargetType::CPU, precision_, infrt::LayoutType::ANY); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -262,11 +265,13 @@ llvm::SmallVector 
MLIRModelGenImpl::GetOpOutputType( if (var_desc.name() == var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -403,3 +408,38 @@ bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, return false; } } + +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type) { + switch (dtype) { + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP16: + *type = infrt::PrecisionType::FLOAT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP32: + *type = infrt::PrecisionType::FLOAT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP64: + *type = infrt::PrecisionType::FLOAT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_BOOL: + *type = infrt::PrecisionType::BOOL; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT8: + *type = infrt::PrecisionType::INT8; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT16: + *type = infrt::PrecisionType::INT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT32: + *type = infrt::PrecisionType::INT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT64: + *type = infrt::PrecisionType::INT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_UINT8: + *type = infrt::PrecisionType::UINT8; + return true; + default: + return false; + } +} diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index d5f1209b9925b6f2bb916cdd99024a5782485365..a351b5cf80e2356a6481ccd302a544dcfe595e05 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -102,4 +102,7 @@ inline std::vector RepeatedToVector( bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, mlir::Builder builder, mlir::Type *type); +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type); + #endif // PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 79502f9fdfd4bd88666f61ff30bc526325b91341..a9077220cfc709116479a5d91b39d56ad4007af8 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -146,8 +146,8 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { // TensorList related methods. 
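+ // dt.tensor_list_get_tensor reads the element index from its "id" attribute, so it is registered with AddKernelWithAttrs instead of AddKernel.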
#ifdef INFRT_WITH_PHI - registry->AddKernel("dt.tensor_list_get_tensor", - INFRT_KERNEL(TensorListGetTensor)); + registry->AddKernelWithAttrs( + "dt.tensor_list_get_tensor", INFRT_KERNEL(TensorListGetTensor), {"id"}); registry->AddKernel("dt.tensor_list_get_size", INFRT_KERNEL(TensorListGetSize)); #endif diff --git a/paddle/infrt/kernel/tensorrt/trt_helper.h b/paddle/infrt/kernel/tensorrt/trt_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..96122bffacdb2251c28e311ae02fe6f9c5319615 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_helper.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "glog/logging.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +static nvinfer1::DataType TensorTypeToWeightType(phi::DataType tensor_type) { + switch (tensor_type) { + case phi::DataType::FLOAT32: + return nvinfer1::DataType::kFLOAT; + case phi::DataType::INT32: + return nvinfer1::DataType::kINT32; + case phi::DataType::FLOAT16: + return nvinfer1::DataType::kHALF; + default: + llvm_unreachable("should not reach here"); + } +} + +static nvinfer1::Dims ArrayAttrToNvDims(const mlir::ArrayAttr& int_array_attr) { + nvinfer1::Dims dims; + dims.nbDims = int_array_attr.size(); + CHECK(!int_array_attr.empty()); + CHECK(int_array_attr[0].getType().isIntOrIndex()); + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = int_array_attr[i].cast().getInt(); + } + return dims; +} + +static nvinfer1::Weights TensorToWeights(phi::DenseTensor* tensor) { + CHECK_NOTNULL(tensor); + nvinfer1::Weights ret; + ret.type = TensorTypeToWeightType(tensor->dtype()); + ret.count = tensor->numel(); + ret.values = tensor->data(); + return ret; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index 04847ac8982f861ab2799bd23b1c2ab723422327..aa7609092b82c8ab08b75bfbd3e252801cc79c7d 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -21,13 +21,19 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" + +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" +#include "paddle/infrt/kernel/tensorrt/trt_layers.h" + #include "paddle/infrt/backends/tensorrt/trt_engine.h" #include "paddle/infrt/backends/tensorrt/trt_options.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" #include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { 
@@ -35,8 +41,7 @@ namespace kernel { namespace tensorrt { ::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( - MlirOperationWithInfrtSymbol - create_engine_op /*, input_tensors, output_tensors, weights*/) { + MlirOperationWithInfrtSymbol create_engine_op) { // TODO(wilber): The device_id needs to be obtained from mlir. int device_id = 0; backends::tensorrt::TrtEngine engine(device_id); @@ -51,6 +56,7 @@ namespace tensorrt { // TODO(wilber): The build option should be filled from mlir info. backends::tensorrt::BuildOptions options; options.max_batch = 4; + options.workspace = 1024; // Parse mlir Region which only has one block. mlir::Operation& operation = *create_engine_op.operation; @@ -62,8 +68,9 @@ namespace tensorrt { auto& region = operation.getRegion(0); auto& block = region.getBlocks().front(); - llvm::DenseMap map_info; std::unordered_map trt_bind_inputs; + ValueToITensorMap value_to_trt_tensor_map; + ValueToTensorMap value_to_tensor_map; for (auto index_operand : llvm::enumerate(operation.getOperands())) { mlir::Value operand = index_operand.value(); @@ -73,69 +80,72 @@ namespace tensorrt { auto* v = symbol_table->GetValue(std::to_string(idx)); CHECK_NOTNULL(v); auto* t = &v->get(); - trt_bind_inputs[input_name] = t; + value_to_tensor_map[operand] = t; + // TODO(wilber): get input info from mlir. + // TODO(wilber): input dims, now only support static_shape, and just remove - // the first dimension. + // the first dimension. If the first dim is not -1, maybe we can pass the + // original dims. + // TODO(wilber): now only support float input. - nvinfer1::Dims dims; - dims.nbDims = t->dims().size() - 1; - for (int i = 0; i < dims.nbDims; ++i) { - dims.d[i] = t->dims()[i + 1]; - } - auto* in = - network->addInput(input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); - map_info[operand] = in; - } - // TODO(wilber): Find a way to add layer. - for (auto& inner_op : block.without_terminator()) { - if (inner_op.getName().getStringRef() == "trt.Activation") { - trt::ActivationOp act_op = llvm::dyn_cast(inner_op); - auto in_arg = act_op.getOperand(); - if (!map_info.count(in_arg)) { - CHECK(false) << "map_info not has in_arg."; + if (operand.isa()) { + // TODO(wilber): A trick: the weights are CPU tensors and inputs are GPU + // tensors, so we treat all GPU tensors as inputs to trt. + if (t->place().GetType() == phi::AllocationType::GPU) { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; } - nvinfer1::ActivationType act_type = - static_cast(act_op.activation_type()); - auto* act_layer = network->addActivation(*map_info[in_arg], act_type); - act_layer->setAlpha(act_op.alpha().convertToFloat()); - act_layer->setBeta(act_op.beta().convertToFloat()); - for (size_t i = 0; i < act_op->getNumResults(); ++i) { - nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); - mlir::Value act_out = act_op->getResult(i); - map_info[act_out] = act_out_tensor; + } else { + // TODO(wilber): Replace with the op name that generates the weights.
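+ // Heuristic: operands created by phi_dt.create_dense_tensor.cpu are treated as weights; every other operand becomes a TensorRT network input.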
+ if (operand.getDefiningOp()->getName().getStringRef() != + "phi_dt.create_dense_tensor.cpu") { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; } } - - // if (inner_op.getName().getStringRef() == "trt.Constant") { - // trt::ConstantOp op = llvm::dyn_cast(inner_op); - // mlir::Value op_out = op.getResult(); - // std::vector weight_data{1}; - // auto* layer = network->addConstant(nvinfer1::Dims2(1, 1), - // nvinfer1::Weights{nvinfer1::DataType::kFLOAT, weight_data.data(), 1}); - // auto* op_out_tenor = layer->getOutput(0); - // map_info[op_out] = op_out_tenor; - // } } - for (auto& inner_op : block.without_terminator()) { - for (mlir::Value v : inner_op.getResults()) { - for (mlir::Operation* user : v.getUsers()) { - if (user->getName().getStringRef() == "infrt.return") { - if (!map_info.count(v)) { - CHECK(false) << "map_info not has value"; - } - network->markOutput(*map_info[v]); - } - } + + // TODO(wilber): Find a way to add layer. + for (auto& operation : block.without_terminator()) { + if (trt::ActivationOp op = llvm::dyn_cast(operation)) { + ActivationFunc( + op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::FullyConnectedOp op = + llvm::dyn_cast(operation)) { + FcFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::ConvolutionOp op = + llvm::dyn_cast(operation)) { + ConvFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else { + CHECK(false) << "not supported operation."; } } - // std::unordered_map trt_bind_outputs; - mlir::Operation* ret = block.getTerminator(); - for (unsigned int i = 0; i < ret->getNumOperands(); ++i) { - mlir::Value arg = ret->getOperand(i); - CHECK(map_info.count(arg)); - map_info[arg]->setName(("output_" + std::to_string(i)).c_str()); + + for (auto index_operand : + llvm::enumerate(block.getTerminator()->getOperands())) { + mlir::Value arg = index_operand.value(); + CHECK(value_to_trt_tensor_map.count(arg)); + // TODO(wilber): A trick that we name trt output tensor's name as output_0, + // output_1, ... + value_to_trt_tensor_map[arg]->setName( + ("output_" + std::to_string(index_operand.index())).c_str()); + network->markOutput(*value_to_trt_tensor_map[arg]); } for (int i = 0; i < network->getNbOutputs(); ++i) { engine.PrepareOutputHandle(network->getOutput(i)->getName()); diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h new file mode 100644 index 0000000000000000000000000000000000000000..19e20c170ec835444a5a37818b837dafb096b2b8 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_layers.h @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
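+// trt_layers.h: helpers that lower trt dialect ops (Activation, Convolution, FullyConnected) onto a TensorRT INetworkDefinition.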
+ +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" + +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +using ValueToTensorMap = llvm::DenseMap; +using ValueToITensorMap = llvm::DenseMap; + +inline void ActivationFunc( + trt::ActivationOp& act_op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + auto in_arg = act_op.getOperand(); + CHECK(value_to_trt_tensor_map.count(in_arg)) + << "value_to_trt_tensor_map not has in_arg."; + + nvinfer1::ActivationType act_type = + static_cast(act_op.activation_type()); + auto* act_layer = + network->addActivation(*value_to_trt_tensor_map[in_arg], act_type); + act_layer->setAlpha(act_op.alpha().convertToFloat()); + act_layer->setBeta(act_op.beta().convertToFloat()); + for (size_t i = 0; i < act_op->getNumResults(); ++i) { + nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); + mlir::Value act_out = act_op->getResult(i); + value_to_trt_tensor_map[act_out] = act_out_tensor; + } +} + +inline void ConvFunc(trt::ConvolutionOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + int out_channel_num = op.out_channel_num(); + auto size_attrs = op.kernel_size(); + nvinfer1::Dims dims = ArrayAttrToNvDims(size_attrs); + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + auto* layer = + network->addConvolutionNd(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + dims, + kernel_weights, + bias_weights); + CHECK_NOTNULL(layer); + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} + +inline void FcFunc(trt::FullyConnectedOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + CHECK(value_to_trt_tensor_map.count(input_tensor_repr)); + + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + int out_channel_num = op.out_channel_num(); + auto* layer = + network->addFullyConnected(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + kernel_weights, + bias_weights); + + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/tests/dialect/disabled_trt.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir similarity index 100% rename from paddle/infrt/tests/dialect/disabled_trt.mlir rename to paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir new file mode 100644 index 0000000000000000000000000000000000000000..c67d47415bfb002d2c7a91ee8b222c6227968d52 --- /dev/null +++ 
b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir @@ -0,0 +1,54 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%input_tensor : !infrt.dense_tensor, %kernel_weight : !infrt.dense_tensor, %kernel_bias : !infrt.dense_tensor, %gpu_ctx : !phi.context) { + %a = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.Convolution"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 3 : si32, kernel_size = [3:i32, 3:i32]} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %gpu_ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %gpu_ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%gpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 28:i64, 28:i64], lod=[0:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64, 3:i64, 3:i64, 3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + infrt.call @run_trt(%input_tensor, %kernel_weight, %kernel_bias, %gpu_ctx) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !phi.context) -> () + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir new file mode 100644 index 0000000000000000000000000000000000000000..78dc4ac1c1093c1eb9b3fb30d0ea3f0cd5be6104 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir @@ -0,0 +1,46 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = 
"phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64, 3:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32, 2.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + %engine = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.FullyConnected"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 2 : si32} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + + %res = "trt.compute"(%engine, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc index 5de159b86fce29f774b07770aaaee0c1b6aebd31..49266910dbd278fb8d429534134097751cf8b6b1 100644 --- a/paddle/infrt/tests/model/test_abs.cc +++ b/paddle/infrt/tests/model/test_abs.cc @@ -72,7 +72,7 @@ TEST(ABS_MODEL, convert_and_execute) { context->getOrLoadDialect(); context->getOrLoadDialect(); context->getOrLoadDialect(); - context->getOrLoadDialect(); + context->getOrLoadDialect(); context->getOrLoadDialect(); context->getOrLoadDialect(); diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index ce40627bb0d3742aff7f60583d2e0b9cbbd8fb02..eae8d12fb371ed48794b573c31f7bd19e21f04f2 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -481,7 +481,21 @@ class PADDLE_API Tensor final { */ void set_autograd_meta(std::shared_ptr autograd_meta); - /* Part 9: Auto generated Tensor methods */ + /* Part 9: Inplace methods */ + + /** + * @brief Increase inplace version + */ + void bump_inplace_version(); + + /** + * @brief Get current inplace version + * + * @return uint32_t + */ + uint32_t current_inplace_version(); + + /* 
Part 10: Auto generated Tensor methods */ private: /** diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 6be85d720007e8464647974f43d42f8430a827a8..6090e6a400ac38b321ca68835f728dd211a0f10b 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -347,5 +347,36 @@ void Tensor::set_autograd_meta( autograd_meta_ = std::move(autograd_meta); } +void Tensor::bump_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: before bump inplace version: " + << inplace_version_counter.CurrentVersion(); + inplace_version_counter.Bump(); + VLOG(3) << "yoki: after bump inplace version: " + << inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "bump_inplace_version is only supported on DenseTensor now.")); + } +} + +uint32_t Tensor::current_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: print version: " + << inplace_version_counter.CurrentVersion(); + return inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "current_inplace_version is only supported on DenseTensor now.")); + } + return 0; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index b1da573c49f2f20c6b25beae189fe5952efd3cef..946230cb169d20db56a46399552b629348c4783f 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,8 +47,13 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "mean_grad", "max", + "max_grad", "min", + "min_grad", + "prod", + "prod_grad", "any", "all", "reshape", diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index d9ed68593cd610790ee4a0015069ac5a8cfea61b..c3356eadcbd2156617a7a69324e7b440cc54b339 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -98,6 +98,28 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCooTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCsrTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -114,6 +136,16 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(SparseCooTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(SparseCsrTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, 
+ default_key.dtype(), + arg_type); } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index b7a7a4ec231ddfdbfd4da75e71aebaa49f99443f..aabb944db30b9f30394f092c245bc0307d8bbf3f 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + namespace phi { namespace detail { @@ -355,6 +357,161 @@ void CrossInferMeta(const MetaTensor& x, out->share_lod(x); } +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], + 0, + phi::errors::InvalidArgument( + "The dilation of Op(Conv) should be larger than 0, but received " + "dilation is %d.", + dilations[i])); + } + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, + true, + phi::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), + in_dims)); + + PADDLE_ENFORCE_EQ( + in_dims.size(), + filter_dims.size(), + phi::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. But received: the input's shape is [%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, + in_dims.size(), + filter_dims, + filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], + 0, + phi::errors::InvalidArgument( + "The stride of Op(Conv) should be larger than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( + in_dims.size(), + strides.size() + 2U, + phi::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be equal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimension and Attr(strides)'s length = %u.", + in_dims.size(), + in_dims, + strides.size(), + phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? in_dims[in_dims.size() - 1] : in_dims[1]; + + PADDLE_ENFORCE_EQ( + input_channels, + filter_dims[1] * groups, + phi::errors::InvalidArgument( + "The number of input's channels should be equal to filter's channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. 
" + "The error may come from wrong data_format setting.", + input_channels, + in_dims, + filter_dims[1], + filter_dims, + groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, + 0, + phi::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], + filter_dims, + groups)); + + if (config.is_runtime) { + PADDLE_ENFORCE_GT( + filter_dims[0], + 0, + phi::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); + } + + DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + + DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + phi::UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!config.is_runtime) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + const int dkernel = dilations[i] * (filter_data_dims[i] - 1) + 1; + int output_size = + (in_data_dims[i] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1; + output_shape.push_back(output_size); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(input.dtype()); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -815,6 +972,13 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out) { + out->set_dims({-1}); // can not infer + out->set_dtype(x.dtype()); +} + void MatmulInferMeta(const MetaTensor& x, const MetaTensor& y, bool trans_x, @@ -918,6 +1082,103 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + if (mode == "all") { + PADDLE_ENFORCE_EQ(phi::product(alpha.dims()), + 1, + phi::errors::InvalidArgument( + "For mode 'all', size of weight Alpha must be one. " + "But recevied alpha's size: %d.", + product(alpha.dims()))); + } else if (mode == "channel") { + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 2, + phi::errors::InvalidArgument( + "For mode 'channel', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ(data_format == "NCHW" || data_format == "NHWC", + true, + phi::errors::InvalidArgument( + "For mode 'channel', data_format must be one of " + "NCHW and NHWC. But recevied data_format: %s", + data_format)); + if (data_format == "NCHW" || config.is_run_mkldnn_kernel) { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). 
But " + "recevied alpha's size: %d, x_dim[1]: %d", + product(alpha.dims()), + x_dim[1])); + } else { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[x_rank - 1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[%d]: %d", + product(alpha.dims()), + x_rank - 1, + x_dim[x_rank - 1])); + } + } else if (mode == "element") { + auto alpha_dim = alpha.dims(); + auto alpha_rank = alpha_dim.size(); + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 1, + phi::errors::InvalidArgument( + "For mode 'element', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ( + alpha_rank, + x_rank, + phi::errors::InvalidArgument( + "For mode 'element', rank of weight Alpha must be ", + "equal to the rank of input(x). But recevied alpha's rank: %d, " + "x's rank: %d.", + alpha_rank, + x_rank)); + size_t x_product = 1; + size_t alpha_product = 1; + for (int64_t i = x_rank - 1; i > 0; i--) { + x_product *= x_dim[i]; + alpha_product *= alpha_dim[i]; + } + PADDLE_ENFORCE_EQ( + alpha_product, + x_product, + phi::errors::InvalidArgument( + "For mode 'element', the size of weight Alpha must be " + "equal to the size of input(x). But recevied alpha's size: %d, " + "x's size: %d.", + alpha_product, + x_product)); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " + "But recevied " + "mode: '%s'.", + mode)); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, @@ -1091,6 +1352,118 @@ void TriangularSolveInferMeta(const MetaTensor& x, out->share_lod(y); } +void YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config) { + auto dim_x = x.dims(); + auto dim_imgsize = img_size.dims(); + int anchor_num = anchors.size() / 2; + + PADDLE_ENFORCE_EQ( + dim_x.size(), + 4, + phi::errors::InvalidArgument("Input(X) should be a 4-D tensor." + "But received X dimension(%s)", + dim_x.size())); + if (iou_aware) { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (6 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " + "+ class_num)) while iou_aware is true." + "But received dim[1](%s) != (anchor_mask_number * " + "(6+class_num)(%s).", + dim_x[1], + anchor_num * (6 + class_num))); + PADDLE_ENFORCE_GE( + iou_aware_factor, + 0, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should greater than or equal to 0." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + PADDLE_ENFORCE_LE( + iou_aware_factor, + 1, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should less than or equal to 1." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + } else { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (5 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))." 
+ "But received dim[1](%s) != (anchor_mask_number * " + "(5+class_num)(%s).", + dim_x[1], + anchor_num * (5 + class_num))); + } + PADDLE_ENFORCE_EQ( + dim_imgsize.size(), + 2, + phi::errors::InvalidArgument("Input(ImgSize) should be a 2-D tensor." + "But received Imgsize size(%s)", + dim_imgsize.size())); + if ((dim_imgsize[0] > 0 && dim_x[0] > 0) || config.is_runtime) { + PADDLE_ENFORCE_EQ( + dim_imgsize[0], + dim_x[0], + phi::errors::InvalidArgument( + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.")); + } + PADDLE_ENFORCE_EQ( + dim_imgsize[1], + 2, + phi::errors::InvalidArgument("Input(ImgSize) dim[1] should be 2." + "But received imgsize dim[1](%s).", + dim_imgsize[1])); + PADDLE_ENFORCE_GT(anchors.size(), + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be greater than 0." + "But received anchors length(%s).", + anchors.size())); + PADDLE_ENFORCE_EQ(anchors.size() % 2, + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be even integer." + "But received anchors length (%s)", + anchors.size())); + PADDLE_ENFORCE_GT(class_num, + 0, + phi::errors::InvalidArgument( + "Attr(class_num) should be an integer greater than 0." + "But received class_num (%s)", + class_num)); + + int box_num; + if ((dim_x[2] > 0 && dim_x[3] > 0) || config.is_runtime) { + box_num = dim_x[2] * dim_x[3] * anchor_num; + } else { + box_num = -1; + } + std::vector dim_boxes({dim_x[0], box_num, 4}); + boxes->set_dims(phi::make_ddim(dim_boxes)); + boxes->set_dtype(x.dtype()); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + scores->set_dims(phi::make_ddim(dim_scores)); +} + void ValueCompareInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, @@ -1104,3 +1477,4 @@ void ValueCompareInferMeta(const MetaTensor& x, } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); +PD_REGISTER_INFER_META_FN(conv2d, phi::ConvInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index cb680415e7d2c42de7b2339b27b22be500dfdf9b..d770a096de7c922c674b7edda55ae8cb531a6d00 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -69,6 +69,20 @@ void CompareInferMeta(const MetaTensor& x, int axis, MetaTensor* out); +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, @@ -138,6 +152,10 @@ void LogLossInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out); + void MatmulInferMeta(const MetaTensor& x, const MetaTensor& y, bool trans_x, @@ -146,6 +164,13 @@ void MatmulInferMeta(const MetaTensor& x, void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config); + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, @@ -173,6 +198,20 @@ void TriangularSolveInferMeta(const MetaTensor& x, bool unitriangular, MetaTensor* out); +void 
YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config = MetaConfig()); + void ValueCompareInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ef75ab573c6d9bd5c65fad747a28f2c704257371..3e9da9a217a0a8837d7edadc70401fdad04b4869 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" #include +#include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { @@ -200,6 +202,114 @@ void AucInferMeta(const MetaTensor& input, } } +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config) { + const auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size(); i++) { + PADDLE_ENFORCE_EQ( + (x_dims[i] == -1) || (x_dims[i] > 0), + true, + phi::errors::InvalidArgument( + "Each dimension of input tensor is expected to be -1 or a " + "positive number, but recieved %d. Input's shape is [%s].", + x_dims[i], + x_dims)); + } + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input " + "X must greater than or equal to 2. But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X " + "must smaller than or equal to 5. But received: the shape of input X " + "= [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + + const int64_t C = ((config.is_run_mkldnn_kernel == true) || + (data_layout == DataLayout::kNCHW) + ? x_dims[1] + : x_dims[x_dims.size() - 1]); + auto scale_dim = scale.dims(); + auto bias_dim = bias.dims(); + + PADDLE_ENFORCE_EQ( + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." 
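To make the shape arithmetic of YoloBoxInferMeta concrete: for a statically shaped input [N, C, H, W], boxes is [N, H*W*anchor_num, 4] and scores is [N, H*W*anchor_num, class_num]. A small sketch, illustrative only, with hypothetical names:

#include <array>
#include <cstdint>

struct YoloBoxShapes {
  std::array<int64_t, 3> boxes;   // [N, box_num, 4]
  std::array<int64_t, 3> scores;  // [N, box_num, class_num]
};

YoloBoxShapes InferYoloBoxShapes(int64_t n, int64_t h, int64_t w,
                                 int64_t anchor_num, int64_t class_num) {
  const int64_t box_num = h * w * anchor_num;  // anchor_num = anchors.size() / 2
  YoloBoxShapes s;
  s.boxes = {n, box_num, 4};
  s.scores = {n, box_num, class_num};
  return s;
}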
+ "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + y->set_dims(x_dims); + mean_out->set_dims({C}); + variance_out->set_dims({C}); + saved_mean->set_dims({C}); + saved_variance->set_dims({C}); + y->share_lod(x); +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -369,6 +479,40 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out) { + const int64_t input_dims = x.dims()[0]; + const int64_t label_dims = label.dims()[0]; + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "The first dimension of " + "input and label is expected to be the same. " + "But received input's first dimension is %d; " + "label's first dimension is %d.", + input_dims, + label_dims)); + + std::vector output_shape({input_dims, 1}); + out->set_dims(phi::make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { auto inputs_dims = GetMetaTensorsDim(x); @@ -543,3 +687,5 @@ void WhereInferMeta(const MetaTensor& condition, } } // namespace phi + +PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 6de95386dd998810b508db6d0469691a37cd53dd..068766c0e11671c93285c077ab2328ac20134a13 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -72,6 +72,26 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config = MetaConfig()); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -87,6 +107,23 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int 
trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out); + void MultiDotInferMeta(const std::vector& x, MetaTensor* out); void PsroiPoolInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 837750710c9a3dcf3c8b414c5c52a7272a0b3f58..556fb874470dd248dcf7c77e8a8ac3510bd6f63e 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -340,29 +340,29 @@ void RoiAlignInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( boxes_num_dims.size(), 1, - phi::errors::InvalidArgument("The size of RoisNum should be 1" + phi::errors::InvalidArgument("The size of boxes_num should be 1" ", but received size = %d", boxes_num_dims.size())); } PADDLE_ENFORCE_EQ(input_dims.size(), 4, phi::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " + "The format of Input(x) in" + "RoiAlignOp is NCHW. And the rank of input must be 4. " "But received rank = %d", input_dims.size())); PADDLE_ENFORCE_EQ(boxes_dims.size(), 2, - phi::errors::InvalidArgument("The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", + phi::errors::InvalidArgument("The rank of Input(boxes) " + "in RoiAlignOp should be 2. " + "But the rank of boxes is %d", boxes_dims.size())); if (config.is_runtime) { PADDLE_ENFORCE_EQ(boxes_dims[1], 4, phi::errors::InvalidArgument( "The second dimension " - "of Input(ROIs) should be 4. But received the " + "of Input(boxes) should be 4. But received the " "dimension = %d", boxes_dims[1])); } @@ -370,21 +370,21 @@ void RoiAlignInferMeta(const MetaTensor& x, PADDLE_ENFORCE_GT(pooled_height, 0, phi::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " + "The 'pooled_height' attribute in RoiAlignOp is " "invalid. The height must be greater than 0. But " "received 'pooled_height' = %d", pooled_height)); PADDLE_ENFORCE_GT(pooled_width, 0, phi::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " + "The 'pooled_width' attribute in RoiAlignOp is " "invalid. The width must be greater than 0. But " "received 'pooled_width' = %d", pooled_width)); PADDLE_ENFORCE_GT(spatial_scale, 0.0f, phi::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " + "The 'spatial_scale' attribute in RoiAlignOp is " "invalid. The scale must be greater than 0. 
But " "received 'spatial_scale' = %f", spatial_scale)); @@ -399,6 +399,81 @@ void RoiAlignInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The second dimension of boxes_num should " + "be 1, but received dimension is %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The input data should be a four-dimensional " + "tensor with [N,C,H,W], but received input data with " + " %d dimension", + input_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims.size(), + 2, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...], but received boxes is " + "%d-dimensional LoDTensor", + boxes_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...]. But the second dimension of " + "the received data is %d", + boxes_dims[1])); + + PADDLE_ENFORCE_GT( + pooled_height, + 0, + phi::errors::OutOfRange("The pooled output height must be greater than 0" + "but received height is %d", + pooled_height)); + PADDLE_ENFORCE_GT( + pooled_width, + 0, + phi::errors::OutOfRange("The pooled output width must be greater than 0" + "but received width is %d", + pooled_width)); + PADDLE_ENFORCE_GT( + spatial_scale, + 0.0f, + phi::errors::OutOfRange("The spatial scale must be greater than 0, " + "but received spatial scale is %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + arg_max->set_dims(out_dims); + arg_max->set_dtype(DataType::INT64); +} + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 0e7b9cb12a4d0b44727f488412af754e2ba8ad94..42a0f35dc1d8d6aef13b631d355a4cee951a4ed1 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -84,6 +84,15 @@ void RoiAlignInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max); + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 8a2d718f124578dbab0164048f8daa09e9a54e8f..0f51839553158b6dce7ac90006c5c72ee8e3b57b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -304,6 +304,17 @@ void DiagonalInferMeta(const MetaTensor& input, out->set_dims(phi::make_ddim(out_dims)); } +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask) { + auto x_dims = x.dims(); + out->set_dims(x_dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + if (mask != nullptr) { + 
mask->set_dims(x_dims); + } +} + void EighInferMeta(const MetaTensor& x, const std::string& uplo, MetaTensor* out_w, @@ -392,6 +403,26 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x, UnchangedInferMetaCheckAxis(x, axis, out); } +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out) { + PADDLE_ENFORCE_GE(bins, + 1, + phi::errors::InvalidArgument( + "The bins should be greater than or equal to 1." + "But received nbins is %d", + bins)); + PADDLE_ENFORCE_GE( + max, + min, + phi::errors::InvalidArgument("max must be larger or equal to min." + "But received max is %d, min is %d", + max, + min)); + + out->set_dims({bins}); + out->share_lod(input); +} + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { PADDLE_ENFORCE_EQ( product(x.dims()), @@ -554,6 +585,67 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_LT(axis, + dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE(axis, + -dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + if (axis < 0) axis += dim_size; + PADDLE_ENFORCE_GE( + k, + 1, + phi::errors::InvalidArgument( + "the k in the kthvalue must >= 1, but received %d .", k)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape")); + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + input_dims[axis], + k, + phi::errors::InvalidArgument( + "input of kthvalue must have >= %d columns in axis of %d", + k, + axis)); + } + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { auto dims = x.dims(); auto n_dim = dims.size(); @@ -726,6 +818,24 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, + MetaTensor* out, + MetaTensor* norm) { + auto xdim = x.dims(); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + + if (is_test == false) { + if (axis < 0) axis = xdim.size() + axis; + xdim[axis] = 1; + norm->set_dims(xdim); + norm->set_dtype(x.dtype()); + } +} + void PadInferMeta(const MetaTensor& input, const std::vector& paddings, float pad_value, @@ -1589,7 +1699,7 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, PADDLE_ENFORCE_GE( axis, -rank, - errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(X). 
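KthvalueInferMeta above collapses the chosen axis to size 1 and drops it entirely unless keepdim is set; the equivalent computation, as an illustrative sketch with a hypothetical name:

#include <cstdint>
#include <vector>

// Sketch: kthvalue output dims for a canonical (non-negative) axis.
std::vector<int64_t> KthvalueOutDims(const std::vector<int64_t>& in_dims,
                                     int axis, bool keepdim) {
  std::vector<int64_t> out;
  for (int i = 0; i < static_cast<int>(in_dims.size()); ++i) {
    if (i == axis) {
      if (keepdim) out.push_back(1);  // the reduced axis collapses to 1
    } else {
      out.push_back(in_dims[i]);
    }
  }
  return out;
}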
But received axis: %d, R: %d.", axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7203a327b55698c0d4bd0271b2908cbc4a9b5ca1..2d51bac995d5142871873dd4a12c22b4bf2de55e 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -74,6 +74,8 @@ void DiagInferMeta(const MetaTensor& x, void DiagonalInferMeta( const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask); + void EighInferMeta(const MetaTensor& x, const std::string& uplo, MetaTensor* out_w, @@ -89,6 +91,8 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x, bool hard, int axis, MetaTensor* out); +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out); void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); @@ -100,6 +104,14 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig = MetaConfig()); + void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); void MaxPoolWithIndexInferMeta(const MetaTensor& x, @@ -122,6 +134,12 @@ void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, + MetaTensor* out, + MetaTensor* norm); void PadInferMeta(const MetaTensor& input, const std::vector& paddings, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 02b5b2d74ad2914f60a1df08e500b06733b95aaa..d140912aa783047ba021be171805adff071bf22b 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,14 +27,18 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel + hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel - triangular_solve_grad_kernel determinant_grad_kernel) + triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) +kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) +kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) diff --git a/paddle/phi/kernels/cpu/deformable_conv_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d61f7be68af9cb23363a51065fd06d8b6492bfa --- /dev/null +++ b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +template +inline void ModulatedDeformableIm2colCPUKernel( + const int num_kernels, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + for (int i = 0; i < num_kernels; i++) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + // get outputs of im2col with offset by bilinear interpolation + ModulatedDeformableIm2colCPUKernel(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + 
col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + CPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..254c4ea5716d19c65da6a46748a43db8dbddd52b --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/gelu_kernel.h" + +namespace phi { + +template +struct GeluGradFunctor { + template + void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { + if (approximate) { + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + + const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const float kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * + ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) + .tanh(); + dx.device(d) = (static_cast(0.5) * casted_dout * + (static_cast(1) + y + + (casted_x - casted_x * y.square()) * + (kAlpha + kBeta * casted_x.square()))) + .template cast(); + } else { + const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const T kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); + dx.device(d) = static_cast(0.5) * dout * + (static_cast(1) + y + + (x - x * y.square()) * (kAlpha + kBeta * x.square())); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto dx_data = dx.data(); + auto dout_data = dout.data(); + int n = std::min(x.size(), dx.size()); + + auto first = static_cast(std::malloc(n * sizeof(T))); + std::memset(first, 0, n * sizeof(T)); + auto second = static_cast(std::malloc(n * sizeof(T))); + std::memset(second, 0, n * sizeof(T)); + + // first = (0.5 * (1 + erf(x / sqrt(2)))) + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, first, 1); + phi::funcs::CBlas::VMERF(n, first, first, VML_LA); + for (int i = 0; i < n; i++) { + first[i] += static_cast(1); + } + phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); + + // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * 
x^2)) + phi::funcs::CBlas::VSQUARE(n, x_data, second); + phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); + phi::funcs::CBlas::VEXP(n, second, second); + phi::funcs::CBlas::VMUL(n, x_data, second, second); + phi::funcs::CBlas::SCAL( + n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); + + // dx = dout * (first + second); + phi::funcs::CBlas::VADD(n, first, second, first); + phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); + + std::free(first); + std::free(second); +#else + // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * + // exp(- x^2 / 2) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + auto first = static_cast(0.5) * + (static_cast(1) + + ((casted_x * static_cast(M_SQRT1_2)).erf())); + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * + casted_x * + (-static_cast(0.5) * casted_x.square()).exp(); + dx.device(d) = (casted_dout * (first + second)).template cast(); + } else { + auto first = + static_cast(0.5) * + (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); + } +#endif + } + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + auto eigen_x = EigenVector::Flatten(x); + auto eigen_out_grad = EigenVector::Flatten(out_grad); + auto eigen_x_grad = EigenVector::Flatten(*x_grad); + auto& dev = *dev_ctx.eigen_device(); + + GeluGradFunctor functor; + functor(dev, eigen_x, eigen_out_grad, eigen_x_grad, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gelu_grad, CPU, ALL_LAYOUT, phi::GeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7af220574565ea96706c2a87aec6751c9203af4 --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
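The non-approximate branch above evaluates d/dx gelu(x) = Phi(x) + x * phi(x), with Phi and phi the standard normal CDF and PDF. A scalar sketch of the same formula, illustrative only (M_SQRT1_2 and M_2_SQRTPI are the POSIX <cmath> constants the kernel also relies on):

#include <cmath>

// Sketch: dout * (0.5 * (1 + erf(x / sqrt(2))) + x * exp(-x^2 / 2) / sqrt(2 * pi)).
double GeluGradScalar(double x, double dout) {
  const double cdf = 0.5 * (1.0 + std::erf(x * M_SQRT1_2));
  const double pdf = 0.5 * M_2_SQRTPI * M_SQRT1_2 * std::exp(-0.5 * x * x);
  return dout * (cdf + x * pdf);
}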
+ +#include "paddle/phi/kernels/gelu_kernel.h" +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +struct GeluFunctor { + template + void operator()(Device d, X x, Out out, bool approximate) const { + if (approximate) { + // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = + (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) + .tanh(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (x + static_cast(GELU_CONSTANT) * x.cube())) + .tanh(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } +#endif + } + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_x = EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + + GeluFunctor functor; + functor(dev, eigen_x, eigen_out, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, CPU, ALL_LAYOUT, phi::GeluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h new file mode 100644 index 0000000000000000000000000000000000000000..b79aab96c0fc2251f35fe93b525a03676e01fdb1 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidGradKernelImpl( + const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad, + SelectedRows* w_grad_sr = nullptr) { + funcs::SetConstant zero; + DenseTensor pre_out_grad; + + pre_out_grad.Resize(pre_out.dims()); + ctx.template Alloc(&pre_out_grad); + ctx.template Alloc(x_grad); + zero(ctx, x_grad, static_cast(0.0)); + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + // softrelu derivative + + auto blas = funcs::GetBlas(ctx); + + auto* pre_out_grad_data = pre_out_grad.data(); + auto* pre_out_data = pre_out.template data(); + auto n = pre_out.numel(); + blas.VEXP(n, pre_out_data, pre_out_grad_data); + blas.VINV(n, pre_out_grad_data, pre_out_grad_data); + for (int64_t i = 0; i < n; ++i) { + pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; + } + bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) + auto* out_grad_data = out_grad.template data(); + + int64_t dim0 = pre_out_grad.dims()[0]; + int64_t dim1 = pre_out_grad.dims()[1]; + for (int64_t i = 0; i < dim0; ++i) { + T tmp = out_grad_data[i]; + blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); + } + // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to + // be consistent with the clipping in forward. 
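The "softrelu derivative" step above is terse: because the forward pass stores pre_out = softrelu(z) = log(1 + e^z), the expression 1 - 1/exp(pre_out) recovers sigmoid(z), which is exactly d softrelu(z)/dz. As a sketch:

#include <cmath>

// Sketch: with pre_out = log(1 + e^z), 1 - 1/exp(pre_out) = 1 - 1/(1 + e^z)
// = sigmoid(z) = d softrelu(z) / dz.
double SoftreluGradFromForward(double pre_out) {
  return 1.0 - 1.0 / std::exp(pre_out);
}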
+ if (bias_grad) { + ctx.template Alloc(bias_grad); + zero(ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } + ctx.template Alloc(w_grad); + zero(ctx, w_grad, static_cast(0.0)); + if (!is_sparse) { + bit_code->MulGradWeight(pre_out_grad, w_grad, x); + } else { + bit_code->MulGradWeight(pre_out_grad, w_grad_sr, x); + } + bit_code->MulGradError(pre_out_grad, w, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f64a1a8162a379bdad99c6519ef996a4203544a7 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad) { + HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad, + bias_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..096a54f9fb263d3c153ab687d83bb61c63b117d7 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/hierarchical_sigmoid_kernel.h" + +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function_impl.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out) { + size_t num_classes_st = static_cast(num_classes); + // for remote prefetch + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + int64_t code_length = path.get_ptr() ? path.get_ptr()->dims()[1] + : math::FindLastSet(num_classes_st - 1); + int64_t batch_size = x.dims()[0]; + DenseTensor sum; + pre_out->Resize(phi::make_ddim({batch_size, code_length})); + ctx.template Alloc(pre_out); + auto* pre_out_data = pre_out->data(); + auto pre_out_mat = EigenMatrix::From(*pre_out); + // Not all class(leaf) nodes' path lengths equal code_length, thus init as + // 0s can avoid out of path's loss. + funcs::SetConstant zero; + zero(ctx, pre_out, static_cast(0.0)); + auto& place = *ctx.eigen_device(); + funcs::RowwiseSum row_sum; + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes_st, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + std::vector sum_dims({batch_size, 1UL}); + sum.Resize(phi::make_ddim(sum_dims)); + ctx.template Alloc(&sum); + auto sum_mat = EigenMatrix::From(sum); + ctx.template Alloc(out); + auto out_mat = EigenMatrix::From(*out); + if (bias.get_ptr()) { + bit_code->Add(*(bias.get_ptr()), pre_out); + } + bit_code->Mul(pre_out, w, x); + // clip to [-40, 40] + paddle::platform::Transform trans; + trans(ctx, + pre_out_data, + pre_out_data + pre_out->numel(), + pre_out_data, + paddle::operators::ClipFunctor(static_cast(-40.0), + static_cast(40.0))); + bit_code->Sum(*pre_out, out, static_cast(-1)); + // use softrelu to calculate cross entropy + pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); + row_sum(ctx, *pre_out, &sum); + // TODO(guosheng): Subtract the out of path's loss, since not all + // class(leaf) nodes' path lengths equal code_length. But it won't break the + // gradient check since both have the out of path's loss and will cancel out + // each other. 
+ out_mat.device(place) = sum_mat + out_mat; +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..185d6cbedc85db83032cecd3c2f6cd1b0f46cbaf --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void kthvalueAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(d_x); + if (axis == in_dims.size() - 1) { + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + memset(x_grad_data, 0, d_x->numel() * sizeof(T)); + if (keepdim) { + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &d_out, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + DDim trans_dims(out_dims); + DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + DenseTensor trans_dO, trans_ind; + trans_dO.Resize(trans_dims); + trans_ind.Resize(trans_dims); + dev_ctx.template Alloc(&trans_dO); + dev_ctx.template Alloc(&trans_ind); + int ndims = trans.size(); + if (keepdim) { + funcs::TransCompute( + ndims, dev_ctx, d_out, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_in_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, d_x->numel() * sizeof(T)); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, d_x, trans); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + CPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e436623cae7bbc27903eff8e2bf01a41ded9c94 --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc @@ 
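Both branches of KthvalueGradKernel above ultimately perform the scatter implemented by kthvalueAssign: each row's incoming gradient lands in the column its k-th value was taken from, and every other entry stays zero. A minimal sketch with hypothetical names:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: scatter each row's gradient into the column recorded in indices;
// all other entries of dx stay zero.
void ScatterKthGradSketch(const std::vector<double>& dout,      // [rows]
                          const std::vector<int64_t>& indices,  // [rows]
                          int64_t width,
                          std::vector<double>* dx) {            // [rows * width]
  std::fill(dx->begin(), dx->end(), 0.0);
  for (std::size_t i = 0; i < dout.size(); ++i) {
    (*dx)[i * width + indices[i]] = dout[i];
  }
}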
-0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void getKthvalue(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices, + const int& k) { + bool partial_sort_flag = (k * 64) < input_width; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), + col_vec.begin() + k, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } else { + std::nth_element( + col_vec.begin(), + col_vec.begin() + k - 1, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + t_out[i] = col_vec[k - 1].first; + t_indices[i] = col_vec[k - 1].second; + } +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + auto out_dims = output->dims(); + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + getKthvalue(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data, + k); + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + 
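The comparator shared by the std::partial_sort and std::nth_element calls above deserves a note: it orders NaNs after every non-NaN value, so the k-th smallest element stays well defined in the presence of NaNs. In isolation (illustrative only):

#include <cmath>

// Sketch: "less than, with NaNs ordered last", mirroring the lambda above.
bool NanAwareLess(double l, double r) {
  return (!std::isnan(l) && std::isnan(r)) || (l < r);
}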
indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_inp; + trans_inp.Resize(trans_dims); + dev_ctx.template Alloc(&trans_inp); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_inp, trans); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + DenseTensor tmp_out, tmp_indices; + tmp_out.Resize(trans_out_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + tmp_indices.Resize(trans_out_dims); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + getKthvalue( + input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k); + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + CPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f344b9cc3fe0a4c71470c361f2e8f370bc5908a --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct LogSoftmaxGradFunctor { + void operator()(const Context& context, + const DenseTensor* Y, + const DenseTensor* dY, + DenseTensor* dX, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int n = funcs::SizeToAxis(axis, Y->dims()); + const int d = funcs::SizeFromAxis(axis, Y->dims()); + phi::DDim dim_2d{n, d}; + + auto y = EigenMatrixTemplate::From(*Y, dim_2d); + auto dy = EigenMatrixTemplate::From(*dY, dim_2d); + auto dx = EigenMatrixTemplate::From(*dX, dim_2d); + + const int axis_dim = Y->dims()[axis]; + const int batch_size = y.dimension(kBatchDim); + const int num_classes = y.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + dx.device(*context.eigen_device()) = + dy - + (y.exp()) * (dy.reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis)); + } +}; + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = out.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(x_grad); + if (out.numel() != 0) { + LogSoftmaxGradFunctor()( + dev_ctx, &out, &out_grad, x_grad, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax_grad, + CPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..241742378cc5d012d2816745d0f83fc586089ef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = static_cast(-64.); + return x < kThreshold ? 
kThreshold : x; + } +}; + +template +struct LogSoftmaxFunctor { + void operator()(const Context& context, + const DenseTensor* X, + DenseTensor* Y, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + + int axis_dim = X->dims()[axis]; + const int n = funcs::SizeToAxis(axis, X->dims()); + const int d = funcs::SizeFromAxis(axis, X->dims()); + phi::DDim dim_2d{n, d}; + + auto logits = EigenMatrixTemplate::From(*X, dim_2d); + auto log_softmax = EigenMatrixTemplate::From(*Y, dim_2d); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into log_softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + log_softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + log_softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + log_softmax.device(*context.eigen_device()) = + log_softmax - + log_softmax.exp() + .eval() + .reshape(batch_axis_remain) + .sum(along_axis) + .log() + .broadcast(one_axis); + } +}; + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(out); + if (x.numel() != 0) { + LogSoftmaxFunctor()(dev_ctx, &x, out, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_grad_kernel.cc b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..97558cdb31f666fd7c5dd8b15e1d7feef6556a0b --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + const T* alpha_ptr = alpha.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (x_grad) { + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[0] * out_grad_ptr[i]; + } + } + } + + index = 0; + if (alpha_grad) { + T* alpha_grad_ptr = dev_ctx.template Alloc(alpha_grad); + memset(alpha_grad_ptr, 0, sizeof(T) * alpha_grad->numel()); + + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + alpha_grad_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prelu_grad, CPU, ALL_LAYOUT, phi::PReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_kernel.cc b/paddle/phi/kernels/cpu/prelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f389ab9ff459d1935518f35e7884d144bec5020 --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, CPU, ALL_LAYOUT, phi::PReluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc similarity index 53% rename from paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc rename to paddle/phi/kernels/cpu/reduce_grad_kernel.cc index efea054555e86be79b5cdb09fe8c4784a1ad0c3b..78a7ae8d415b5d4b18fdf8e469576db50f739e38 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc @@ -12,33 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
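Reviewer note on the two CPU PReLU kernels above: forward and backward share the same flat-index-to-alpha-index arithmetic, repeated once per mode. Pulled out as a standalone helper (hypothetical, for illustration only), the mapping is:

// Reviewer sketch, not part of this PR: which alpha element scales flat
// element i, per PReLU mode. dim is the input shape, e.g. {N, C, H, W}.
#include <cstdint>
#include <string>
#include <vector>

inline int64_t AlphaIndex(int64_t i, const std::vector<int64_t>& dim,
                          const std::string& mode, bool channel_last) {
  if (mode == "channel") {
    if (!channel_last) {                      // NCHW
      int64_t plane = 1;                      // product of spatial dims, H * W * ...
      for (size_t j = 2; j < dim.size(); ++j) plane *= dim[j];
      return (i / plane) % dim[1];
    }
    return i % dim.back();                    // NHWC: channel is the last dim
  }
  if (mode == "element") {                    // one alpha per element of a sample
    int64_t per_sample = 1;
    for (size_t j = 1; j < dim.size(); ++j) per_sample *= dim[j];
    return i % per_sample;
  }
  return 0;                                   // scalar: one shared alpha
}

The backward kernel uses the same index to accumulate alpha_grad[index] += x * out_grad for negative inputs, which is why alpha_grad is memset to zero first.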
-#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" +#include "paddle/phi/kernels/reduce_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/cpu/reduce_grad.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" namespace phi { -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - template void ComputeFromInput(const Context& dev_ctx, const DenseTensor& x, @@ -111,16 +97,38 @@ void ReduceSumGradKernel(const Context& dev_ctx, } } - ReduceGradKernel(dev_ctx, - x, - out_grad, - paddle::none, - dims, - keep_dim, - reduce_all, - in_dtype, - out_dtype, - x_grad); + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); } } // namespace phi @@ -137,3 +145,38 @@ PD_REGISTER_KERNEL(sum_grad, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double) {} + +PD_REGISTER_KERNEL(prod_grad, + CPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h index fd90c7b8f5eee81b517013069ca9c2b366aa7d13..f105c94d559d873c8a11025a3c8c931010050445 100644 --- a/paddle/phi/kernels/cumsum_kernel.h +++ b/paddle/phi/kernels/cumsum_kernel.h @@ -18,7 +18,7 @@ namespace phi { -template +template void CumsumKernel(const Context& dev_ctx, const DenseTensor& x, int axis, diff --git a/paddle/phi/kernels/deformable_conv_kernel.h b/paddle/phi/kernels/deformable_conv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3886e6801a31bf9f747b324ae4c355bd48c53cd7 --- /dev/null +++ b/paddle/phi/kernels/deformable_conv_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DeformableConvKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& offset,
+                          const DenseTensor& filter,
+                          const DenseTensor& mask,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings,
+                          const std::vector<int>& dilations,
+                          int deformable_groups,
+                          int groups,
+                          int im2col_step,
+                          DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h
index 19c6d52c4c9018f821c4e7f6ddaebf933aa045e8..fa285dc69d1ca552afbd1f41e050ee603be07239 100644
--- a/paddle/phi/kernels/funcs/pooling.h
+++ b/paddle/phi/kernels/funcs/pooling.h
@@ -43,7 +43,7 @@ template <class T>
 class MaxPool {
  public:
   DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
+  HOSTDEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
   DEVICE inline void finalize(const T& pool_field, T* y) {}
 };
diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h
index c74880e04322474e28385997b5022ebf52643bf4..b793afb63b1dca9bbd8ad09b83461567de6371ad 100644
--- a/paddle/phi/kernels/funcs/reduce_functor.h
+++ b/paddle/phi/kernels/funcs/reduce_functor.h
@@ -73,5 +73,82 @@ struct AnyFunctor {
   }
 };
 
+struct MeanGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
+  }
+};
+
+struct SumGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+
+struct ProdGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+  }
+};
+
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+ dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h index 3488b6f2f86b20e0b758f3aa75a6739c40cd81db..11197a52261d7d0fd7618d2c7c0de09b57abe0d8 100644 --- a/paddle/phi/kernels/funcs/reduce_grad_functions.h +++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h @@ -41,14 +41,14 @@ void ReduceGradFunctor(const Context& dev_ctx, Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; - int broad_cats_times = 1; + int broad_cast_times = 1; for (size_t i = 0; i < dims_ref.size(); ++i) { if (dims_ref[i] < 0) { dims_ref[i] = x_rank + dims_ref[i]; } reduced_dims_v[dims_ref[i]] = 1; broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; - broad_cats_times *= x_dims[dims_ref[i]]; + broad_cast_times *= x_dims[dims_ref[i]]; } auto reduced_dims = phi::make_ddim(reduced_dims_v); auto x_reduce = EigenTensor::From(input1, reduced_dims); @@ -62,7 +62,7 @@ void ReduceGradFunctor(const Context& dev_ctx, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); + broad_cast_times); } inline void GetOriginDimFromShuffled(const DDim& src_dim, diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index d82d793e5343a48306572068722e2fe587c0aa57..19f1f3d3cd2fadff918da25cc873944e927a473a 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -165,6 +165,26 @@ inline void SubmPreProcess(const Context& dev_ctx, x_grad_ptr); } +inline const std::vector PoolResetKernel( + const std::vector& kernel_sizes, + const int in_channels, + const int out_channels) { + std::vector res(kernel_sizes); + res.resize(5); + res[3] = in_channels; + res[4] = out_channels; + return res; +} + +inline void PrefixSum(const int* counter, int* offsets, const int n) { + int offset = 0; + for (int i = 0; i < n; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[n] = offset; +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/reduce_sum_grad_kernel.h b/paddle/phi/kernels/gelu_grad_kernel.h similarity index 61% rename from paddle/phi/kernels/reduce_sum_grad_kernel.h rename to paddle/phi/kernels/gelu_grad_kernel.h index ab4d63297efffc70710e496efa08f4b9c7e5f7ce..fd70e8d54bc8d004373efd1874f4b07a9ebde6a8 100644 --- a/paddle/phi/kernels/reduce_sum_grad_kernel.h +++ b/paddle/phi/kernels/gelu_grad_kernel.h @@ -14,19 +14,18 @@ #pragma once -#include "paddle/phi/common/data_type.h" +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + #include "paddle/phi/core/dense_tensor.h" + namespace phi { template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType in_dtype, - DataType out_dtype, - DenseTensor* x_grad); - +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/phi/kernels/gelu_kernel.h similarity index 50% rename from paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu rename to paddle/phi/kernels/gelu_kernel.h index a578c9f7d81083c533028b9c8912a24006ed0292..bc106a04031fbcc2a96209e170d60eda8cc7b5e1 100644 --- 
a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/phi/kernels/gelu_kernel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -// .part used to speed up nvcc compile -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#pragma once -template -using CUDAReduceMeanGradKernel = - ops::ReduceCudaGradKernel; +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif -REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel); +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define GELU_CONSTANT 0.044715 + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1db6e1b7cf73375f2617c727a26e5768922777d4 --- /dev/null +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
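Reviewer note on gelu_kernel.h above: GELU_CONSTANT (0.044715) is the cubic coefficient of the tanh approximation, and the kernels build their kAlpha from M_2_SQRTPI * M_SQRT1_2, which equals sqrt(2/pi). For reference, a scalar sketch of the two variants the GPU kernels below implement (not part of this PR):

// Reviewer sketch: scalar reference for both GELU paths.
#define _USE_MATH_DEFINES  // M_SQRT1_2 / M_2_SQRTPI on Windows
#include <cmath>

double GeluExact(double x) {  // approximate == false: x * Phi(x)
  return x * 0.5 * (1.0 + std::erf(x * M_SQRT1_2));
}

double GeluTanh(double x) {   // approximate == true
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2;  // sqrt(2 / pi)
  return 0.5 * x * (1.0 + std::tanh(kAlpha * (x + 0.044715 * x * x * x)));
}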
+ +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ModulatedDeformableIm2colGpuKernel( + const int nthreads, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * 
col_shape[1] * col_shape[2] * col_shape[3]; + + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; + + ModulatedDeformableIm2colGpuKernel< + T><<>>(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + GPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..2b9be7c6154354f7fd20b316610521a02801243f --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +#ifdef __NVCC__ +template +static __device__ __forceinline__ float FP32FastTanh(float x) { +#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 + if (FastMode) { + float y; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); + return y; + } +#endif + return tanhf(x); +} + +template +static __device__ __forceinline__ float FP32GeluFwd(float x) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + return x * 0.5f * (1.0f + tanh_out); +} + +template +static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * + (0.79788456f + 0.1070322243f * x * x)) + + 0.5f * (1.0f + tanh_out); + return tmp * y_g; +} + +template +static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, + __half* y, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT in_arr = *reinterpret_cast(x + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + float tmp = __half2float(in_arr[i]); + in_arr[i] = __float2half(FP32GeluFwd(tmp)); + } + *reinterpret_cast(y + offset) = in_arr; + } +} + +template +static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = 
phi::AlignedVector<__half, VecSize>; + ArrT x_in_arr = *reinterpret_cast(x + offset); + ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + __half2 tmp_fp16_2; + tmp_fp16_2.x = x_in_arr[i]; + tmp_fp16_2.y = y_g_in_arr[i]; + float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); + x_in_arr[i] = + __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); + } + *reinterpret_cast(x_g + offset) = x_in_arr; + } +} + +static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, const __half* x, __half* y, size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(y, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluFwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>(x, y, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL + return false; +} + +static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, + const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ + is_aligned(x_g, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluBwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>( \ + x, y_g, x_g, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL + return false; +} +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e21f8d4267bca5363d58b63e0a37d076b4d06af --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + MPType kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + auto cube_x = x * x * x; + auto tanh_out = + tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); + auto ans = + half * (one + tanh_out + + (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); + return static_cast(ans * dout); + } +}; + +template +struct GeluWithoutApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); + const MPType cdf = normcdf(x); + const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; + return static_cast(dout * (cdf + x * pdf)); + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + std::vector ins = {&x, &out_grad}; + std::vector outs = {x_grad}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* x_ptr = reinterpret_cast(x.data()); + const auto* y_g_ptr = reinterpret_cast(out_grad.data()); + auto* x_g_ptr = reinterpret_cast<__half*>(x_grad->data()); + if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + dev_ctx, x_ptr, y_g_ptr, x_g_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu_grad, + GPU, + ALL_LAYOUT, + phi::GeluGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ce6dda2d6cc6526853cf563779cfe5ad1a21ffe1 --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // this function is tanh approximation of gelu + MPType x = static_cast(arg_x); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + auto tanh_out = + tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); + MPType out = x * half * (one + tanh_out); + return static_cast(out); + } +}; + +template +struct GeluWithoutApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // actual gelu with approximation = false + MPType x = static_cast(arg_x); + return static_cast(x * normcdf(x)); + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* in_ptr = reinterpret_cast(x.data()); + auto* out_ptr = reinterpret_cast<__half*>(out->data()); + if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + dev_ctx, in_ptr, out_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, + GPU, + ALL_LAYOUT, + phi::GeluKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f6e96046a2bd799f4a6b8d30a239afb505582deb --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
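Reviewer note on the GELU GPU kernels above: the FP16 specialization only takes the vectorized fast path when the element count divides the vector width and every pointer is aligned for AlignedVector<__half, 8> loads; otherwise it falls back to BroadcastKernel. Host-side, the gate reduces to this check (standalone sketch, not part of this PR; alignment is passed in where the real macro uses alignof(AlignedVector<__half, 8>)):

// Reviewer sketch: the divisibility / alignment gate behind
// TryLaunchFP16FastGeluFwdVectorizeCUDAKernel.
#include <cstddef>
#include <cstdint>

bool CanUseVectorizedFp16Path(const void* x, const void* y, size_t n,
                              size_t vec_size, size_t alignment) {
  auto aligned = [&](const void* p) {
    return reinterpret_cast<uintptr_t>(p) % alignment == 0;
  };
  return n % vec_size == 0 && aligned(x) && aligned(y);
}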
+ +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +static int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + if (axis < 0) axis += in_dims.size(); + T* x_grad_data = dev_ctx.template Alloc(d_x); + const T* out_grad_data = d_out.data(); + const int64_t* indices_data = indices.data(); + int pre, n, post; + paddle::operators::GetDims(in_dims, axis, &pre, &n, &post); + int block_size = getBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + paddle::operators::AssignGradWithAxis< + T><<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + GPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4218e153ec29bd1757b2405f0af638040de9bff2 --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu @@ -0,0 +1,252 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
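Reviewer note on the kthvalue backward kernel above: with the reduced axis laid out as (pre, n, post) and the kthvalue output of width 1, AssignGradWithAxis is expected to zero-fill x_grad and scatter each out_grad element back to the axis position recorded in indices. A plain CPU sketch of those semantics, inferred from the call site (not part of this PR):

// Reviewer sketch: scatter out_grad into a zeroed x_grad at the stored
// per-(pre, post) axis positions.
#include <cstddef>
#include <cstdint>
#include <cstring>

template <typename T>
void ScatterKthvalueGrad(const T* out_grad, const int64_t* indices,
                         T* x_grad, int pre, int n, int post) {
  const size_t numel = static_cast<size_t>(pre) * n * post;
  std::memset(x_grad, 0, sizeof(T) * numel);
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < post; ++j) {
      const int64_t axis_pos = indices[i * post + j];  // position along axis
      x_grad[(i * n + axis_pos) * post + j] = out_grad[i * post + j];
    }
  }
}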
+ +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +inline int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +bool SortKthvalue(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + const int k, + DenseTensor* out_tensor, + DenseTensor* indices_tensor) { + auto cu_stream = dev_ctx.stream(); + DenseTensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = phi::make_ddim(dims); + input_indices.Resize(dim); + dev_ctx.template Alloc(&input_indices); + size_t temp_storage_bytes = -1; + int block_size = getBlockSize(num_cols); + unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0]; + unsigned int grid_size = num_rows < maxGridDimX + ? static_cast(num_rows) + : maxGridDimX; + paddle::operators::InitIndex< + int64_t><<>>( + input_indices.data(), num_rows, num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, + paddle::operators::SegmentOffsetIter(num_cols)); + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + DenseTensor temp_values, temp_indices; + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(dev_ctx.GetPlace()); + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = dev_ctx.template Alloc(&temp_values); + sorted_indices_ptr = dev_ctx.template Alloc(&temp_indices); + auto err = + cub::DeviceSegmentedRadixSort::SortPairs(nullptr, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " + << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, status: " + << cudaGetErrorString(err); + return false; + } +#endif + DenseTensor temp_storage; + temp_storage.Resize({static_cast(temp_storage_bytes / sizeof(uint8_t))}); + uint8_t* temp_storage_data = dev_ctx.template Alloc(&temp_storage); + + err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", 
status: " << cudaGetErrorString(err); + return false; + } +#endif + auto& dev = *dev_ctx.eigen_device(); + const Eigen::DSizes slice_indices{0, k - 1}; + const Eigen::DSizes slice_sizes{num_rows, 1}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = + EigenMatrix::From(static_cast(temp_indices)); + std::vector odims = {static_cast(num_rows), static_cast(1)}; + dim = phi::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = + EigenMatrix::From(static_cast(temp_values)); + + funcs::EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + funcs::EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); + return true; +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + auto out_dims = output->dims(); + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue( + dev_ctx, &x, input_width, input_height, k, output, indices), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + return; + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_input; + trans_input.mutable_data(trans_dims, dev_ctx.GetPlace()); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans); + DenseTensor trans_ind, trans_out; + trans_ind.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + trans_out.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue(dev_ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, trans_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + GPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git 
a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7b282536558db524c082de11c7ca92b2bd98edc --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context &dev_ctx, + const DenseTensor &out, + const DenseTensor &out_grad, + int axis, + DenseTensor *x_grad) { + dev_ctx.template Alloc(x_grad); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx, out, out_grad, axis, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d7e34c6c14e7a49f50c016d888f6fb875dca0776 --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
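Reviewer note on the GPU log_softmax files above: they delegate to the shared softmax GPU driver, but the quantity computed is the same identity the Eigen CPU functor earlier in this diff spells out. With y = log_softmax(x) and dy the upstream gradient, dx_i = dy_i - exp(y_i) * sum_j dy_j, since exp(y) is exactly softmax(x). A per-row scalar sketch (not part of this PR):

// Reviewer sketch: log-softmax backward for one row.
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<double> LogSoftmaxGradRow(const std::vector<double>& y,
                                      const std::vector<double>& dy) {
  double dy_sum = 0.0;
  for (double g : dy) dy_sum += g;
  std::vector<double> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i) {
    dx[i] = dy[i] - std::exp(y[i]) * dy_sum;  // softmax(x) == exp(y)
  }
  return dx;
}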
+ +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + DenseTensor *out) { + dev_ctx.template Alloc(out); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..76ee9439a2050b000b5cffd1df47581141a874c7 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_funcs.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +#define CUDA_NUM_THREADS 1024 + +inline static int PADDLE_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelFirstWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t plane_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluChannelLastWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t channel_index = index % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, + const T *alpha, + T *output, + size_t spatial_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t element_index = index % spatial_size; + T scale = alpha[element_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, + const T *alpha, + T *output, + size_t numel) { + T scale = alpha[0]; + CUDA_KERNEL_LOOP(index, numel) { + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? 
x : scale * x; + } +} + +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel); +}; + +template +void PreluChannelWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel) { + if (channel_last) { + PReluChannelLastWiseKernel<<>>( + input, alpha, output, channel, numel); + } else { + PReluChannelFirstWiseKernel<<>>( + input, alpha, output, channel, numel / batch_size / channel, numel); + } +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel) { + PReluElementWiseKernel<<>>( + input, alpha, output, numel / batch_size, numel); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel) { + PReluScalarKernel<<>>( + input, alpha, output, numel); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d8661268e82c35f48d9877120574628c4325ae4e --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
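Reviewer note on prelu_funcs.h above: all four CUDA kernels launch with the fixed CUDA_NUM_THREADS = 1024 block size and a block count from PADDLE_GET_BLOCKS, a ceiling division, so blocks * threads >= numel and the grid-stride CUDA_KERNEL_LOOP absorbs any excess threads. The launch arithmetic, as a host-side sketch (not part of this PR):

// Reviewer sketch: the ceiling division behind PADDLE_GET_BLOCKS.
constexpr int kCudaNumThreads = 1024;  // CUDA_NUM_THREADS

inline int GetBlocks(int n) {
  return (n + kCudaNumThreads - 1) / kCudaNumThreads;  // ceil(n / threads)
}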
+ +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +enum PRELU_MODE { Element, ChannelFirst, ChannelLast, PRELU_Scalar }; + +template +__global__ void PReluOpGradKernel(const T* x_ptr, + const T* alpha_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + T* alpha_grad_ptr, + size_t channel_num, + size_t plane_size, + size_t spatial_size, + size_t numel, + PRELU_MODE mode) { + CUDA_KERNEL_LOOP(index, numel) { + T scale; + if (mode == Element) { + size_t element_index = index % spatial_size; + scale = alpha_ptr[element_index]; + } else if (mode == ChannelFirst) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + scale = alpha_ptr[channel_index]; + } else if (mode == ChannelLast) { + size_t channel_index = index % channel_num; + scale = alpha_ptr[channel_index]; + } else { + scale = alpha_ptr[0]; + } + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + if (x_grad_ptr != nullptr) + x_grad_ptr[index] = (x > zero) ? out_grad : scale * out_grad; + if (alpha_grad_ptr != nullptr) + alpha_grad_ptr[index] = (x > zero) ? zero : x * out_grad; + } +} + +template +class PreluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* alpha, + const T* out_grad, + T* x_grad, + T* alpha_grad, + const DDim& input_dims, + PRELU_MODE mode) { + size_t numel = 1; + for (size_t i = 0; i < input_dims.size(); ++i) { + numel *= input_dims[i]; + } + size_t plane_size = numel / input_dims[0] / input_dims[1]; + size_t spatial_size = numel / input_dims[0]; + size_t channel = + mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1]; + + PReluOpGradKernel< + T><<>>( + x, + alpha, + out_grad, + x_grad, + alpha_grad, + channel, + plane_size, + spatial_size, + numel, + mode); + } +}; + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad ? dev_ctx.template Alloc(x_grad) : nullptr; + T* alpha_grad_ptr = + alpha_grad ? dev_ctx.template Alloc(alpha_grad) : nullptr; + + if (!x_grad && !alpha_grad) return; + + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + std::vector input_shape = phi::vectorize(dim); + auto stream = dev_ctx.stream(); + + T* alpha_grad_tmp_ptr; + DenseTensor alpha_grad_tmp; + if (alpha_grad_ptr == nullptr) { + alpha_grad_tmp_ptr = alpha_grad_ptr; + } else { + DenseTensorMeta alpha_grad_meta( + alpha_grad->dtype(), dim, alpha_grad->layout()); + alpha_grad_tmp = phi::Empty(dev_ctx, std::move(alpha_grad_meta)); + alpha_grad_tmp_ptr = alpha_grad_tmp.data(); + } + + PRELU_MODE m; + bool channel_last = false; + if (mode == "element") { + m = Element; + } else if (mode == "channel") { + channel_last = data_format == "NHWC"; + m = channel_last ? 
ChannelLast : ChannelFirst; + } else { + m = PRELU_Scalar; + } + PreluOpGradFunctor prelu_grad; + prelu_grad(stream, + x_ptr, + alpha_ptr, + out_grad_ptr, + x_grad_ptr, + alpha_grad_tmp_ptr, + dim, + m); + + if (alpha_grad_tmp_ptr == nullptr) return; + + std::vector reduce_dims; + for (size_t i = 0; i < dim.size(); i++) { + if (mode == "channel" && !channel_last && i == 1) continue; + if (mode == "channel" && channel_last && i == dim.size() - 1) continue; + if (mode == "element" && i != 0) continue; + reduce_dims.push_back(i); + } + + phi::funcs::ReduceKernel>( + static_cast(dev_ctx), + alpha_grad_tmp, + alpha_grad, + kps::IdentityFunctor(), + reduce_dims); +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu_grad, + GPU, + ALL_LAYOUT, + phi::PReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8255a7ba2ed96dcdeb8d6e23a4637ce56d636a12 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + const T* alpha_ptr = alpha.data(); + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + + VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" + << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; + + if (mode == "channel") { + bool channel_last = data_format == "NHWC"; + size_t channel = channel_last ? 
dim[x_rank - 1] : dim[1]; + PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(dev_ctx.stream(), + x_ptr, + alpha_ptr, + o_ptr, + dim[0], + channel, + channel_last, + numel); + } else if (mode == "element") { + PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise( + dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel); + } else { + PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, + GPU, + ALL_LAYOUT, + phi::PReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index d21c8a3fa46f81c046c722db50ac62fb57cf64f4..e32101b73728f637da0626d691018842aedd62e7 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -43,5 +43,59 @@ void ReduceGrad(const GPUContext& dev_ctx, })); } +template class TransformOp> +void ReduceGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + auto* in_x = &x; + auto* d_out = &out_grad; + auto* d_x = x_grad; + + auto pt_out_dtype = in_dtype; + + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = + funcs::details::GetReduceDim(dims, dim_size, reduce_all); + + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + // make new tensor + DenseTensor new_d_out(d_out->dtype()); + new_d_out.ShareDataWith(*d_out); + new_d_out.Resize(phi::make_ddim(update_dims)); + if (in_dtype != DataType::UNDEFINED) { + dev_ctx.Alloc(d_x, in_dtype); + } else { + dev_ctx.Alloc(d_x, d_out->dtype()); + } + + auto pt_d_out = new_d_out; + auto pt_d_x = *d_x; + if (in_dtype == DataType::UNDEFINED) { + pt_out_dtype = d_out->dtype(); + } + using MPType = typename kps::details::MPTypeTrait::Type; + + phi::ReduceGrad>( + dev_ctx, + &pt_d_out, + &pt_d_x, + pt_out_dtype, + TransformOp(reduce_num)); +} + } // namespace phi #endif diff --git a/paddle/phi/kernels/gpu/reduce_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5256048267ea19a4cb12387ebbc582a2df1bd1b1 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" + +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(sum_grad, + GPU, + ALL_LAYOUT, + phi::ReduceSumGradKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(prod_grad, + GPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu deleted file mode 100644 index 9f4ddc3cf37a744355f6f79b7cd18b3d06b80062..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
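// Why one shared ReduceGradKernel (reduce_grad.h above) can back both
// sum_grad and mean_grad: the two differ only in the TransformOp
// template-template argument that maps each broadcast out_grad element to an
// x_grad element; the standalone reduce_sum_grad_kernel.cu deleted just below
// carried a private copy of the same logic. A scalar sketch of the two
// functors (bodies here are illustrative assumptions, not Paddle's kps
// definitions):
template <typename Tx, typename Ty = Tx>
struct IdentitySketch {  // sum_grad: dx = broadcast(dout)
  explicit IdentitySketch(int /*reduce_num*/) {}
  Ty operator()(const Tx& dout) const { return static_cast<Ty>(dout); }
};
template <typename Tx, typename Ty = Tx>
struct DivideSketch {  // mean_grad: dx = broadcast(dout) / reduce_num
  explicit DivideSketch(int reduce_num) : n_(static_cast<Ty>(reduce_num)) {}
  Ty operator()(const Tx& dout) const { return static_cast<Ty>(dout) / n_; }
  Ty n_;
};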
- -#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/gpu/reduce_grad.h" - -namespace phi { - -template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType in_dtype, - DataType out_dtype, - DenseTensor* x_grad) { - auto* in_x = &x; - auto* d_out = &out_grad; - auto* d_x = x_grad; - - auto pt_out_dtype = in_dtype; - - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = - funcs::details::GetReduceDim(dims, dim_size, reduce_all); - - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - DenseTensor new_d_out(d_out->dtype()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(phi::make_ddim(update_dims)); - if (in_dtype != DataType::UNDEFINED) { - dev_ctx.Alloc(d_x, in_dtype); - } else { - dev_ctx.Alloc(d_x, d_out->dtype()); - } - - auto pt_d_out = new_d_out; - auto pt_d_x = *d_x; - if (in_dtype == DataType::UNDEFINED) { - pt_out_dtype = d_out->dtype(); - } - using MPType = typename kps::details::MPTypeTrait::Type; - - phi::ReduceGrad>( - dev_ctx, - &pt_d_out, - &pt_d_x, - pt_out_dtype, - kps::IdentityFunctor(reduce_num)); -} - -} // namespace phi - -PD_REGISTER_KERNEL(sum_grad, - GPU, - ALL_LAYOUT, - phi::ReduceSumGradKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f7a327cd3f566d7d3e3da9517ba2f50d67b6ba60 --- /dev/null +++ b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
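// What the hierarchical_sigmoid kernels declared in the surrounding headers
// compute, as a scalar sketch for one sample: each class label corresponds to
// a root-to-leaf path in a binary tree, and the op sums a binary
// cross-entropy term per path node. The helper name and the code convention
// below are illustrative assumptions, not the phi implementation:
#include <cmath>
#include <vector>
float HSigmoidLossOneSample(const float* x,     // [feature_dim]
                            const float* w,     // [num_nodes, feature_dim]
                            const float* bias,  // [num_nodes] or nullptr
                            int feature_dim,
                            const std::vector<int>& path_nodes,
                            const std::vector<int>& path_codes) {
  float loss = 0.0f;
  for (size_t j = 0; j < path_nodes.size(); ++j) {
    float z = bias ? bias[path_nodes[j]] : 0.0f;
    for (int k = 0; k < feature_dim; ++k) {
      z += w[path_nodes[j] * feature_dim + k] * x[k];  // pre_out for node j
    }
    float p = 1.0f / (1.0f + std::exp(-z));  // sigmoid(pre_out)
    loss -= path_codes[j] ? std::log(p) : std::log(1.0f - p);
  }
  return loss;
}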
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void HierarchicalSigmoidGradKernel(const Context& ctx,
+                                   const DenseTensor& x,
+                                   const DenseTensor& w,
+                                   const DenseTensor& label,
+                                   const DenseTensor& pre_out,
+                                   const DenseTensor& out_grad,
+                                   paddle::optional<const DenseTensor&> path,
+                                   paddle::optional<const DenseTensor&> code,
+                                   paddle::optional<const DenseTensor&> bias,
+                                   int num_classes,
+                                   bool remote_prefetch,
+                                   int trainer_id,
+                                   const std::vector<int64_t>& height_sections,
+                                   const std::vector<std::string>& epmap,
+                                   const std::vector<std::string>& table_names,
+                                   bool is_sparse,
+                                   DenseTensor* x_grad,
+                                   DenseTensor* w_grad,
+                                   DenseTensor* bias_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..619b022904b17b3669abe61dc5ce341f6c6ae9bc
--- /dev/null
+++ b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void HierarchicalSigmoidKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& w,
+                               const DenseTensor& label,
+                               paddle::optional<const DenseTensor&> path,
+                               paddle::optional<const DenseTensor&> code,
+                               paddle::optional<const DenseTensor&> bias,
+                               int num_classes,
+                               bool remote_prefetch,
+                               int trainer_id,
+                               const std::vector<int64_t>& height_sections,
+                               const std::vector<std::string>& epmap,
+                               const std::vector<std::string>& table_names,
+                               bool is_sparse,
+                               DenseTensor* out,
+                               DenseTensor* pre_out,
+                               DenseTensor* w_out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8795808a643d2741ca210b13303febd187a193a
--- /dev/null
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -0,0 +1,173 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
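// The im2col sampler in the file below (DmcnIm2colBilinear) reads the input
// at fractional (h, w) coordinates via bilinear interpolation. An equivalent
// standalone form, as a sketch; boundary handling mirrors the zero
// contribution of out-of-range taps in the kernel below:
#include <cmath>
float BilinearSample(
    const float* img, int height, int width, float h, float w) {
  int h0 = static_cast<int>(std::floor(h));
  int w0 = static_cast<int>(std::floor(w));
  float lh = h - h0, lw = w - w0;  // fractional offsets inside the cell
  auto at = [&](int r, int c) -> float {
    return (r < 0 || r > height - 1 || c < 0 || c > width - 1)
               ? 0.0f  // taps outside the image contribute zero
               : img[r * width + c];
  };
  return (1 - lh) * (1 - lw) * at(h0, w0) + (1 - lh) * lw * at(h0, w0 + 1) +
         lh * (1 - lw) * at(h0 + 1, w0) + lh * lw * at(h0 + 1, w0 + 1);
}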
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + T h, + T w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; + + T v1 = + (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; + T v2 = (h_low >= 0 && w_high <= width - 1) + ? bottom_data[h_low * data_width + w_high] + : 0; + T v3 = (h_high <= height - 1 && w_low >= 0) + ? bottom_data[h_high * data_width + w_low] + : 0; + T v4 = (h_high <= height - 1 && w_high <= width - 1) + ? bottom_data[h_high * data_width + w_high] + : 0; + + T w1 = hh * hw; + T w2 = hh * lw; + T w3 = lh * hw; + T w4 = lh * lw; + + return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col); + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const DenseTensor& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out) { + const int batch_size = static_cast(x.dims()[0]); + + std::vector filter_shape_vec(phi::vectorize(filter.dims())); + std::vector output_shape_vec(phi::vectorize(out->dims())); + + // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + + DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); + DenseTensor output_buffer = Empty(dev_ctx, output_buffer_shape_vec); + + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; + + DenseTensor weight_3d; + weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); + + DenseTensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer) + .Resize(phi::make_ddim({groups, K, N})); + + DenseTensor output_4d; + output_4d.ShareDataWith(output_buffer) + .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); + + DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); + std::vector input_shape_vec = phi::vectorize(input_shape); + + int input_dim = x.numel() / x.dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask.numel() / mask.dims()[0]; + + auto blas = phi::funcs::GetBlas(dev_ctx); + + const T* input_ptr = x.data(); + const T* offset_ptr = 
offset.data(); + const T* mask_ptr = mask.data(); + T* col_buffer_ptr = col_buffer.data(); + + for (int i = 0; i < batch_size / im2col_step; ++i) { + ModulatedDeformableIm2col(dev_ctx, + input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + col_buffer_ptr); + DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize( + phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + // get the product of pixel and weight + for (int g = 0; g < groups; ++g) { + DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); + DenseTensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); + blas.MatMul(weight_3d_slice, + false, + col_buffer_3d_slice, + false, + T(1.0), + &output_3d_slice, + T(0.0)); + } + } + out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h similarity index 100% rename from paddle/phi/kernels/cpu/reduce_grad.h rename to paddle/phi/kernels/impl/reduce_grad.h diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..4a74416e3916492e6d3a40e09ca347db485fff7c --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..baaa544f137366f1e0343c25bc373cc08350f7fd --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..6b93e98cec0168ab55e15e3401a72738f79d3a07 --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/kthvalue_grad_kernel.h b/paddle/phi/kernels/kthvalue_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..488dde8237b0882b6606834f8a510ef360da1b24 --- /dev/null +++ b/paddle/phi/kernels/kthvalue_grad_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/kthvalue_kernel.h b/paddle/phi/kernels/kthvalue_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4809b9af4832f5d9c036d26adfc6ba5c7a808889 --- /dev/null +++ b/paddle/phi/kernels/kthvalue_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_grad_kernel.h b/paddle/phi/kernels/log_softmax_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6336bc14105bb55deacbfdc20a69a56c6ceca81a --- /dev/null +++ b/paddle/phi/kernels/log_softmax_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_kernel.h b/paddle/phi/kernels/log_softmax_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2caaa86d46c35888c5aaa944019c070f0dd64e17 --- /dev/null +++ b/paddle/phi/kernels/log_softmax_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_grad_kernel.h b/paddle/phi/kernels/prelu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..15917e2e1f02e896d12e971e7dfa52685f57a676 --- /dev/null +++ b/paddle/phi/kernels/prelu_grad_kernel.h @@ -0,0 +1,31 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_kernel.h b/paddle/phi/kernels/prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..251332a8158dcbfa45cbb6c183e06789c21894db --- /dev/null +++ b/paddle/phi/kernels/prelu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
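// Reference semantics for the LogSoftmaxKernel declared a little above, in
// the numerically stable max-shift + log-sum-exp form (a plain 1-D CPU
// sketch under that assumption; the registered kernels are optimized per
// backend):
#include <algorithm>
#include <cmath>
void LogSoftmax1D(const float* x, float* y, int n) {
  float mx = *std::max_element(x, x + n);  // shift by the max for stability
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) sum += std::exp(x[i] - mx);
  float lse = mx + std::log(sum);  // log(sum_j exp(x_j))
  for (int i = 0; i < n; ++i) y[i] = x[i] - lse;
}
// The matching gradient, which LogSoftmaxGradKernel computes from out and
// out_grad:  dx_i = dy_i - exp(y_i) * sum_j dy_j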
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ee6f3d19a094d29546e82e7138933eceb96459d0 --- /dev/null +++ b/paddle/phi/kernels/reduce_grad_kernel.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h index 75f52c36beb76abcd0cc05a7b46935a56d35da64..69bcb47bc98eadd46eeff5c1f92ccf9cf0c9a9d3 100644 --- a/paddle/phi/kernels/reduce_kernel.h +++ b/paddle/phi/kernels/reduce_kernel.h @@ -16,7 +16,6 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" namespace phi { template diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..80b2a1f6678a27594f0fd3319ccb938dac67bf13 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { +namespace sr { + +static std::vector PathToRows(const DenseTensor& path) { + std::set rows; + const int64_t* paths = path.data(); + for (int64_t i = 0; i < path.numel(); ++i) { + int64_t row = paths[i]; + if (row < 0) { + continue; + } + rows.emplace(row); + } + return std::vector(rows.begin(), rows.end()); +} + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + path.get_ptr(), + errors::NotFound("Custom tree must be set for sparse mode!")); + paddle::framework::Vector real_rows = PathToRows(*path); + w_grad->set_rows(real_rows); + // Build a map of id -> row_index to speed up finding the index of one id + w_grad->set_height(w.dims()[0]); + auto* w_grad_value = w_grad->mutable_value(); + phi::DDim temp_dim(w.dims()); + temp_dim[0] = real_rows.size(); + w_grad_value->Resize(temp_dim); + phi::HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad_value, + bias_grad, + w_grad); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad_sr, + CPU, + ALL_LAYOUT, + phi::sr::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..557c8b1bc5eed2c64f3a5c16d52cead124815ffc --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..3010d480b55c9583ff5af9271b2e063667a69da7 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel()); + + phi::funcs::MaxPoolGrad grad_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + for (int c = 0; c < channels; c++) { + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, 
+ double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..86971242df5aeed5b0acd74f23db185e02544846 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + + const T* in_features_ptr = x.non_zero_elements().data(); + // 1. product rule book + ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); + + int rulebook_len = rulebook->dims()[1]; + const int* rulebook_ptr = rulebook->data(); + const int* counter_ptr = counter_per_kernel.data(); + + std::vector offsets(kernel_size + 1); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + std::vector out_flags(out->nnz(), false); + + // 2. 
max pool + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + phi::funcs::MaxPool max_pool_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + if (!out_flags[out_i]) { + out_flags[out_i] = true; + memcpy(&out_features_ptr[out_i * in_channels], + &in_features_ptr[in_i * in_channels], + in_channels * sizeof(T)); + } else { + for (int c = 0; c < in_channels; c++) { + max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], + &out_features_ptr[out_i * in_channels + c]); + } + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1048dd1be0c01c1fa40a8fb2bcab4dca01837d3c --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, + const T* out_features_ptr, + const T* out_grad_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* x_grad_ptr) { + phi::funcs::MaxPoolGrad grad_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int c = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int in_channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0), + h_counter(kernel_size); + phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], + rulebook_ptr, + rulebook_len * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < rulebook_len; i++) { + counter[h_counter[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0.0f)); + + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolGradCudaKernel<<>>( + in_features_ptr, + out_features_ptr, + out_grad_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + x_grad_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f6a0d13b1ddbd375a90808789a61e0cb045a7c9 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -0,0 +1,140 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolCudaKernel(const T* in_features_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* out_features_ptr) { + phi::funcs::MaxPool max_pool_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int channel_i = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], + &out_features_ptr[out_i * channels + channel_i]); + } +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + std::vector offsets(kernel_size + 1), counter(kernel_size); + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + // 1. product rulebook + int rulebook_len = ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_key, + &unique_value, + out, + &counter, + &offsets); + + const int* rulebook_ptr = rulebook->data(); + + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + const T* in_features_ptr = x.non_zero_elements().data(); +// 2. 
max pool +#ifdef PADDLE_WITH_HIP + thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), +#endif + out_features_ptr, + out_features_ptr + out->non_zero_elements().numel(), + static_cast(-FLT_MAX)); + // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>( + in_features_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + out_features_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..572ade76281bc0e6af6be48ed8cc1a96751412ed --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad); + +template +DenseTensor MaxPoolGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes) { + DenseTensor x_grad = phi::Empty( + dev_ctx, + DenseTensorMeta(x.dtype(), x.non_zero_elements().dims(), x.layout())); + MaxPoolGradKernel( + dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); + return x_grad; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bfadbf72e300fd633e8475475442658a7db20ad9 --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_kernel.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T, typename Context>
+void MaxPoolKernel(const Context& dev_ctx,
+                   const SparseCooTensor& x,
+                   const std::vector<int>& kernel_sizes,
+                   const std::vector<int>& paddings,
+                   const std::vector<int>& dilations,
+                   const std::vector<int>& strides,
+                   SparseCooTensor* out,
+                   DenseTensor* rulebook);
+
+template <typename T, typename Context>
+SparseCooTensor MaxPool(const Context& dev_ctx,
+                        const SparseCooTensor& x,
+                        const std::vector<int>& kernel_sizes,
+                        const std::vector<int>& paddings,
+                        const std::vector<int>& dilations,
+                        const std::vector<int>& strides,
+                        DenseTensor* rulebook) {
+  DenseTensor indices = phi::Empty(
+      dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+  DenseTensor values =
+      phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
+  SparseCooTensor coo(indices, values, x.dims());
+  MaxPoolKernel<T, Context>(
+      dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook);
+  return coo;
+}
+
+}  // namespace sparse
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc
index 59b4eabfa47aa2693c4070ca3e4d8c54f5a42d32..01084e764ed9e41ffb1e67cda26051f5a61fdeeb 100644
--- a/paddle/phi/ops/compat/cumprod_sig.cc
+++ b/paddle/phi/ops/compat/cumprod_sig.cc
@@ -1,4 +1,3 @@
-
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddle/phi/ops/compat/deformable_conv_sig.cc b/paddle/phi/ops/compat/deformable_conv_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2a21673634c30988c64e74ffdb1f489a2392f63
--- /dev/null
+++ b/paddle/phi/ops/compat/deformable_conv_sig.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
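// Hypothetical call shape for the MaxPool convenience wrapper above, pooling
// a 5-D (N, D, H, W, C) sparse COO input with a 3x3 window over H/W (tensor
// construction elided; names are illustrative):
//
//   DenseTensor rulebook;
//   SparseCooTensor y = phi::sparse::MaxPool<float>(
//       dev_ctx, x_coo, /*kernel_sizes=*/{1, 3, 3}, /*paddings=*/{0, 1, 1},
//       /*dilations=*/{1, 1, 1}, /*strides=*/{1, 1, 1}, &rulebook);
//
// The rulebook produced here is what the backward kernels earlier in this
// diff consume to route out_grad values back to the contributing input sites.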
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeformableConvOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("deformable_conv", + {"Input", "Offset", "Filter", "Mask"}, + {"strides", + "paddings", + "dilations", + "deformable_groups", + "groups", + "im2col_step"}, + {"Output"}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(deformable_conv, + phi::DeformableConvOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gelu_sig.cc b/paddle/phi/ops/compat/gelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf4b47bcf5fa9c1fb9d03f6b332c0c867211f5ac --- /dev/null +++ b/paddle/phi/ops/compat/gelu_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu", {"X"}, {"approximate"}, {"Out"}); +} + +KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu_grad", + {"X", GradVarName("Out")}, + {"approximate"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping); diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..20183d1a9b06634c38f9aa57a31cd58363e0095b --- /dev/null +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
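All of the *_sig.cc files in this patch follow the same pattern: translate the legacy fluid op's named inputs, attributes, and outputs (in that order) into a phi KernelSignature, then register the mapping under the fluid op name. A minimal sketch for a hypothetical operator; my_relu and its alpha attribute are invented for illustration, while KernelSignature, GradVarName, and PD_REGISTER_ARG_MAPPING_FN are the real utilities used in the surrounding files:

    #include "paddle/phi/core/compat/op_utils.h"

    namespace phi {

    // Forward op: inputs {X}, attributes {alpha}, outputs {Out}.
    KernelSignature MyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
      return KernelSignature("my_relu", {"X"}, {"alpha"}, {"Out"});
    }

    // Grad op: GradVarName("X") expands to the framework's name for the
    // gradient variable associated with X.
    KernelSignature MyReluGradOpArgumentMapping(
        const ArgumentMappingContext& ctx) {
      return KernelSignature("my_relu_grad",
                             {"X", GradVarName("Out")},
                             {"alpha"},
                             {GradVarName("X")});
    }

    }  // namespace phi

    PD_REGISTER_ARG_MAPPING_FN(my_relu, phi::MyReluOpArgumentMapping);
    PD_REGISTER_ARG_MAPPING_FN(my_relu_grad, phi::MyReluGradOpArgumentMapping);

A mapping may also branch on how an output is stored, as HierarchicalSigmoidGradOpArgumentMapping below does with IsDenseTensorOutput/IsSelectedRowsOutput.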
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature HierarchicalSigmoidOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {"Out", "PreOut", "W_Out"}); +} + +KernelSignature HierarchicalSigmoidGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else if (ctx.IsSelectedRowsOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad_sr", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid, + phi::HierarchicalSigmoidOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid_grad, + phi::HierarchicalSigmoidGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kthvalue_sig.cc b/paddle/phi/ops/compat/kthvalue_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e59e9de1e43822ed8d50b8c1d1888e0d1d14540f --- /dev/null +++ b/paddle/phi/ops/compat/kthvalue_sig.cc @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KthvalueGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("kthvalue_grad", + {GradVarName("Out"), "X", "Indices"}, + {"k", "axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/log_softmax_sig.cc b/paddle/phi/ops/compat/log_softmax_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1ecc6d56768f069c208a0230722929200f1dfe0 --- /dev/null +++ b/paddle/phi/ops/compat/log_softmax_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogSoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("log_softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad, + phi::LogSoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/prelu_sig.cc b/paddle/phi/ops/compat/prelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd296c5e95318332523a3cf07e85f1afd6f8a95c --- /dev/null +++ b/paddle/phi/ops/compat/prelu_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("prelu_grad", + {"X", "Alpha", GradVarName("Out")}, + {"mode", "data_format"}, + {GradVarName("X"), GradVarName("Alpha")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PReluGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 789496ccbd01c12504e1aeb9f89b60bf94a091c9..4bca0523801c1a94f90197c93cc495c2c4f56eeb 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -136,6 +136,42 @@ KernelSignature ReduceSumGradOpArgumentMapping( {GradVarName("X")}); } +KernelSignature ReduceMeanGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "mean_grad", + {"X", GradVarName("Out")}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "min_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceProdGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "prod_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); @@ -147,6 
+183,10 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_mean_grad, mean_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); @@ -158,3 +198,11 @@ PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad, phi::ReduceSumGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_mean_grad, + phi::ReduceMeanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad, + phi::ReduceProdGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad, + phi::ReduceMaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad, + phi::ReduceMinGradOpArgumentMapping); diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index 317dcce92c8edd1bb76b080cdb578d37eb8b1f58..3897c182e481ce3ae81c406c35e138adf2f7071f 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -14,6 +14,7 @@ cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils) cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_conv3d_dev_api SRCS test_sparse_conv3d_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sparse_pool_dev_api SRCS test_sparse_pool_dev_api.cc DEPS phi phi_api_utils) cc_test(test_math_function SRCS test_math_function.cc DEPS math_function) if(WITH_GPU) diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..27673704168c9eace0958db770a2309d10da648c --- /dev/null +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -0,0 +1,391 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
+
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace tests {
+
+template <typename T1, typename T2>
+std::vector<T2> cast(const std::vector<T1>& in) {
+ std::vector<T2> out(in.size());
+ for (uint64_t i = 0; i < in.size(); i++) {
+ out[i] = static_cast<T2>(in[i]);
+ }
+ return out;
+}
+template <typename T>
+void TestMaxPoolBase(const std::vector<int>& indices,
+ const std::vector<T>& features,
+ const DDim& x_dims,
+ const std::vector<int>& correct_out_indices,
+ const std::vector<T>& correct_out_features,
+ const DDim& correct_out_dims,
+ const int non_zero_num,
+ const std::vector<int>& kernel_sizes,
+ const std::vector<int>& paddings,
+ const std::vector<int>& strides,
+ const std::vector<int>& dilations,
+ const float diff = 1e-3,
+ const bool backward = false,
+ const std::vector<T> features_grad = {}) {
+ phi::CPUContext dev_ctx_cpu;
+ dev_ctx_cpu.SetAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(paddle::platform::CPUPlace())
+ .get());
+ dev_ctx_cpu.Init();
+
+ const int in_channels = x_dims[4];
+ const int out_channels = in_channels;
+
+ DenseTensor indices_tensor = phi::Empty(
+ dev_ctx_cpu,
+ DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
+ memcpy(
+ indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
+ DenseTensor features_tensor = phi::Empty(
+ dev_ctx_cpu,
+ DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+ {non_zero_num, in_channels},
+ DataLayout::NHWC));
+ memcpy(
+ features_tensor.data<T>(), features.data(), features.size() * sizeof(T));
+
+ SparseCooTensor x_tensor(indices_tensor, features_tensor, x_dims);
+
+ auto f_verify = [&](const T* real_data, const std::vector<T>& correct_data) {
+ for (uint64_t i = 0; i < correct_data.size(); i++) {
+ float tmp = std::fabs(static_cast<float>(correct_data[i] - real_data[i]));
+ ASSERT_LT(tmp, diff);
+ }
+ };
+
+ if (!std::is_same<T, phi::dtype::float16>::value) {
+ DenseTensor rulebook = phi::Empty(
+ dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+ SparseCooTensor out = sparse::MaxPool<T>(dev_ctx_cpu,
+ x_tensor,
+ kernel_sizes,
+ paddings,
+ dilations,
+ strides,
+ &rulebook);
+
+ ASSERT_EQ(correct_out_dims.size(), out.dims().size());
+ for (int i = 0; i < correct_out_dims.size(); i++) {
+ ASSERT_EQ(correct_out_dims[i], out.dims()[i]);
+ }
+ ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz());
+
+ int cmp_indices = memcmp(correct_out_indices.data(),
+ out.non_zero_indices().data<int>(),
+ correct_out_indices.size() * sizeof(int));
+ ASSERT_EQ(cmp_indices, 0);
+
+ f_verify(out.non_zero_elements().data<T>(), correct_out_features);
+
+ if (backward) {
+ DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_cpu,
+ x_tensor,
+ rulebook,
+ out,
+ out.non_zero_elements(),
+ kernel_sizes);
+ f_verify(x_grad.data<T>(), features_grad);
+ }
+ }
+
+// test gpu
+#if defined(PADDLE_WITH_CUDA)
+ phi::GPUContext dev_ctx_gpu;
+ dev_ctx_gpu.PartialInitWithoutAllocator();
+ dev_ctx_gpu.SetAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
+ .get());
+ dev_ctx_gpu.SetHostAllocator(
+ paddle::memory::allocation::AllocatorFacade::Instance()
+ .GetAllocator(phi::CPUPlace())
+ .get());
+ dev_ctx_gpu.PartialInitWithAllocator();
+
+ DenseTensor d_indices_tensor = phi::Empty(
+ dev_ctx_gpu,
+ DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
+ phi::Copy(
+ dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
+
+ DenseTensor d_features_tensor = phi::Empty(
+ dev_ctx_gpu,
+ DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+ {non_zero_num, in_channels},
+ DataLayout::NHWC));
+ phi::Copy(
+ dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor);
+
+ SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
+
+ DenseTensor d_rulebook = phi::Empty(
+ dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+ SparseCooTensor d_out = sparse::MaxPool<T>(dev_ctx_gpu,
+ d_x_tensor,
+ kernel_sizes,
+ paddings,
+ dilations,
+ strides,
+ &d_rulebook);
+
+ ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
+ ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
+ for (int i = 0; i < correct_out_dims.size(); i++) {
+ ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]);
+ }
+
+ DenseTensor h_indices_tensor = phi::Empty(
+ dev_ctx_cpu,
+ DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
+ phi::Copy(dev_ctx_gpu,
+ d_out.non_zero_indices(),
+ phi::CPUPlace(),
+ true,
+ &h_indices_tensor);
+
+ int cmp_indices2 = memcmp(correct_out_indices.data(),
+ h_indices_tensor.data<int>(),
+ correct_out_indices.size() * sizeof(int));
+ ASSERT_EQ(cmp_indices2, 0);
+
+ DenseTensor h_features_tensor = phi::Empty(
+ dev_ctx_cpu,
+ DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+ {d_out.nnz()},
+ d_out.layout()));
+
+ phi::Copy(dev_ctx_gpu,
+ d_out.non_zero_elements(),
+ phi::CPUPlace(),
+ true,
+ &h_features_tensor);
+ f_verify(h_features_tensor.data<T>(), correct_out_features);
+
+ if (backward) {
+ DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_gpu,
+ d_x_tensor,
+ d_rulebook,
+ d_out,
+ d_out.non_zero_elements(),
+ kernel_sizes);
+ DenseTensor h_features_grad = phi::Empty(
+ dev_ctx_cpu,
+ DenseTensorMeta(x_grad.dtype(), x_grad.dims(), x_grad.layout()));
+ phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad);
+ f_verify(h_features_grad.data<T>(), features_grad);
+ }
+#endif
+}
+
+void TestMaxPool(const std::vector<int>& indices,
+ const std::vector<float>& features,
+ const DDim& x_dims,
+ const std::vector<int>& correct_out_indices,
+ const std::vector<float>& correct_out_features,
+ const DDim& correct_out_dims,
+ const int non_zero_num,
+ const std::vector<int>& kernel_sizes,
+ const std::vector<int>& paddings,
+ const std::vector<int>& strides,
+ const std::vector<int>& dilations,
+ const float diff = 1e-3,
+ const bool backward = false,
+ const std::vector<float> features_grad = {}) {
+ // test float
+ TestMaxPoolBase<float>(indices,
+ features,
+ x_dims,
+ correct_out_indices,
+ correct_out_features,
+ correct_out_dims,
+ non_zero_num,
+ kernel_sizes,
+ paddings,
+ strides,
+ dilations,
+ diff,
+ backward,
+ features_grad);
+ // test double
+ TestMaxPoolBase<double>(indices,
+ cast<float, double>(features),
+ x_dims,
+ correct_out_indices,
+ cast<float, double>(correct_out_features),
+ correct_out_dims,
+ non_zero_num,
+ kernel_sizes,
+ paddings,
+ strides,
+ dilations,
+ diff,
+ backward,
+ cast<float, double>(features_grad));
+}
+
+TEST(DEV_API, sparse_maxpool) {
+ const int channels = 1;
+ DDim x_dims = {1, 1, 4, 4, channels};
+ DDim out_dims = {1, 1, 2, 2, channels};
+ std::vector<int> kernel_sizes = {1, 3, 3};
+ std::vector<int> paddings = {0, 0, 0};
+ std::vector<int> strides = {1, 1, 1};
+ std::vector<int> dilations = {1, 1, 1};
+
+ const int non_zero_num = 3;
+ std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+ std::vector<float> features = {1, 2, 3};
+ std::vector<int> out_indices = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+ };
+ std::vector<float> out_features = {2, 2, 3, 3};
+ std::vector<float> x_grad = {0, 4, 6};
+
+ TestMaxPool(indices,
+ features,
+ x_dims,
+ out_indices,
+ out_features,
+ out_dims,
+ non_zero_num,
+ kernel_sizes,
+ paddings,
+ strides,
+ dilations,
+ 1e-6,
+ true,
+ x_grad);
+}
+
+TEST(DEV_API, sparse_maxpool_stride) {
+ const int channels = 1;
+ DDim x_dims = {1, 1, 4, 4, channels};
+ DDim out_dims = {1, 1, 1, 1, channels};
+ std::vector<int> kernel_sizes = {1, 3, 3};
+ std::vector<int> paddings = {0, 0, 0};
+ std::vector<int> strides = {2, 2, 2};
+ std::vector<int> dilations = {1, 1, 1};
+
+ const int non_zero_num = 3;
+ std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+ std::vector<float> features = {1, 2, 3};
+ std::vector<int> out_indices = {0, 0, 0, 0};
+ std::vector<float> out_features = {2};
+ std::vector<float> x_grad = {0, 2, 0};
+
+ TestMaxPool(indices,
+ features,
+ x_dims,
+ out_indices,
+ out_features,
+ out_dims,
+ non_zero_num,
+ kernel_sizes,
+ paddings,
+ strides,
+ dilations,
+ 1e-6,
+ true,
+ x_grad);
+}
+
+TEST(DEV_API, sparse_maxpool_channel) {
+ const int channels = 2;
+ DDim x_dims = {1, 1, 4, 4, channels};
+ DDim out_dims = {1, 1, 2, 2, channels};
+ std::vector<int> kernel_sizes = {1, 3, 3};
+ std::vector<int> paddings = {0, 0, 0};
+ std::vector<int> strides = {1, 1, 1};
+ std::vector<int> dilations = {1, 1, 1};
+
+ const int non_zero_num = 3;
+ std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+ std::vector<float> features = {1, 1, 2, 2, 3, 3};
+ std::vector<int> out_indices = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+ };
+ std::vector<float> out_features = {2, 2, 2, 2, 3, 3, 3, 3};
+ std::vector<float> x_grad = {0, 0, 4, 4, 6, 6};
+
+ TestMaxPool(indices,
+ features,
+ x_dims,
+ out_indices,
+ out_features,
+ out_dims,
+ non_zero_num,
+ kernel_sizes,
+ paddings,
+ strides,
+ dilations,
+ 1e-6,
+ true,
+ x_grad);
+}
+
+TEST(DEV_API, sparse_maxpool3d) {
+ const int channels = 2;
+ DDim x_dims = {1, 5, 4, 4, channels};
+ DDim out_dims = {1, 3, 2, 2, channels};
+ std::vector<int> kernel_sizes = {3, 3, 3};
+ std::vector<int> paddings = {0, 0, 0};
+ std::vector<int> strides = {1, 1, 1};
+ std::vector<int> dilations = {1, 1, 1};
+
+ const int non_zero_num = 3;
+ std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+ std::vector<float> features = {1, 1, 2, 2, 3, 3};
+ std::vector<int> out_indices = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+ };
+ std::vector<float> out_features = {2, 2, 2, 2, 3, 3, 3, 3};
+ std::vector<float> x_grad = {0, 0, 4, 4, 6, 6};
+
+ TestMaxPool(indices,
+ features,
+ x_dims,
+ out_indices,
+ out_features,
+ out_dims,
+ non_zero_num,
+ kernel_sizes,
+ paddings,
+ strides,
+ dilations,
+ 1e-6,
+ true,
+ x_grad);
+}
+
+} // namespace tests
+} // namespace phi
diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh
index 850d4015abf7a8164add9d4896d5a9bdfa26989d..1b259023f94df7279066533bb6c182a644b4e9c2 100755
--- a/paddle/scripts/infrt_build.sh
+++ b/paddle/scripts/infrt_build.sh
@@ -45,6 +45,7 @@ function update_pd_ops() {
 python3 generate_pd_op_dialect_from_paddle_op_maker.py
 python3 generate_phi_kernel_dialect.py
 # generate test model
+ cd ${PADDLE_ROOT}
 python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs
}
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index
75afa4ef43ff602d0fe4b9a6ce7c7c6ad5aab8a0..78a863040ade1a43e9de660bff59f5179535abef 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -55,7 +55,6 @@ wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" -if not defined BRANCH set BRANCH=develop if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto @@ -70,7 +69,6 @@ if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON if not defined WITH_TPCACHE set WITH_TPCACHE=OFF -if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_SCCACHE set WITH_SCCACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF @@ -145,17 +143,6 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt rmdir %BUILD_DIR% /s/q - - : clear third party cache every once in a while - if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party /s/q - ) goto :mkbuild ) @@ -212,6 +199,7 @@ echo There is not sccache in this PC, will install sccache. echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" xcopy sccache.exe %PYTHON_ROOT%\ /Y +del sccache.exe goto:eof rem -------Caching strategy 2: End -------------------------------- @@ -232,13 +220,12 @@ set WITH_AVX=ON set MSVC_STATIC_CRT=OFF set ON_INFER=OFF set WITH_TENSORRT=ON +set WITH_INFERENCE_API_TEST=OFF call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for OPENBLAS/CPU------ @@ -254,8 +241,6 @@ call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for unittests and inference in CUDA11-MKL-AVX---------- @@ -265,7 +250,6 @@ set WITH_GPU=ON set WITH_AVX=ON set MSVC_STATIC_CRT=ON set ON_INFER=ON -set WITH_TESTING=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=ON @@ -274,7 +258,8 @@ call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error ::call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error +::call :test_inference_ut || goto test_inference_ut_error +call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------Build windows avx whl package------ @@ -365,18 +350,6 @@ if "%WITH_GPU%"=="ON" ( nvidia-smi 2>NUL ) -rem ------pre install clcache and init config---------- -rem pip install clcache --user -pip uninstall -y clcache -:: set USE_CLCACHE to 
enable clcache
-rem set USE_CLCACHE=1
-:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-rem set CLCACHE_HARDLINK=1
-:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
-:: set maximum cache size to 20G
-rem clcache.exe -M 21474836480
-
 rem ------set third_party cache dir------
@@ -384,6 +357,25 @@ if "%WITH_TPCACHE%"=="OFF" (
 goto :cmake_impl
 )
+rem clear third party cache every ten days
+for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
+set day_now=%datetime:~6,2%
+set day_before=-1
+set /p day_before=< %cache_dir%\day_third_party.txt
+if %day_now% NEQ %day_before% (
+ echo %day_now% > %cache_dir%\day_third_party.txt
+ type %cache_dir%\day_third_party.txt
+ if %day_now% EQU 21 (
+ rmdir %cache_dir%\third_party /s/q
+ )
+ if %day_now% EQU 11 (
+ rmdir %cache_dir%\third_party /s/q
+ )
+ if %day_now% EQU 01 (
+ rmdir %cache_dir%\third_party /s/q
+ )
+)
+
 echo set -ex > cache.sh
 echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake ^|md5sum ^| awk '{print $1}') >> cache.sh
 echo echo ${md5_content}^>md5.txt >> cache.sh
@@ -535,11 +527,7 @@ echo Build Paddle the %build_times% time:
 if %GENERATOR% == "Ninja" (
 ninja all
 ) else (
- if "%WITH_CLCACHE%"=="OFF" (
- MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
- ) else (
- MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
- )
+ MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
 )
 if %ERRORLEVEL% NEQ 0 (
@@ -774,77 +762,8 @@ echo ========================================
 echo Step 6. Check whether deleting a unit test ...
 echo ========================================
-cd /d %work_dir%\%BUILD_DIR%
-echo set -e> check_change_of_unittest.sh
-echo set +x>> check_change_of_unittest.sh
-echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh
-echo GIT_PR_ID=%AGILE_PULL_ID% >> check_change_of_unittest.sh
-echo BRANCH=%BRANCH%>> check_change_of_unittest.sh
-echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh
-echo exit 0 >> check_change_of_unittest.sh
-echo fi>> check_change_of_unittest.sh
-echo set -x>> check_change_of_unittest.sh
-echo cat ^<^<EOF>> check_change_of_unittest.sh
-echo ============================================ >> check_change_of_unittest.sh
-echo Generate unit tests.spec of this PR. >> check_change_of_unittest.sh
-echo ============================================ >> check_change_of_unittest.sh
-echo EOF>> check_change_of_unittest.sh
-echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh
-echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh
-echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh
-echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh
-echo echo ipipe_log_param_Windows_1_Card_TestCases_Count: $num>> check_change_of_unittest.sh
-echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh
-echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh
-echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh
-echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh
-echo elif [ "$origin_upstream_url" ^!= "$UPSTREAM_URL" ] ^\>> check_change_of_unittest.sh
-echo ^&^& [ "$origin_upstream_url" ^!= "$UPSTREAM_URL.git" ]; then>> check_change_of_unittest.sh
-echo git remote remove upstream>> check_change_of_unittest.sh
-echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh
-echo fi>> check_change_of_unittest.sh
-echo if [ ! -e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>> check_change_of_unittest.sh
-echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_unittest.sh
-echo fi>> check_change_of_unittest.sh
-echo git checkout -b origin_pr >> check_change_of_unittest.sh
-echo git checkout -f $BRANCH >> check_change_of_unittest.sh
-echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
--DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^
--DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
--DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
--DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^
--DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> check_change_of_unittest.sh
-echo cat ^<^<EOF>> check_change_of_unittest.sh
-echo ============================================ >> check_change_of_unittest.sh
-echo Generate unit tests.spec of develop.
>> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo EOF>> check_change_of_unittest.sh -echo spec_path=$(pwd)/UNITTEST_DEV.spec>> check_change_of_unittest.sh -echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh -echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>> check_change_of_unittest.sh -echo if [ "$unittest_spec_diff" ^!= "" ]; then>> check_change_of_unittest.sh -echo set +x>> check_change_of_unittest.sh -echo approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>> check_change_of_unittest.sh -echo set -x>> check_change_of_unittest.sh -echo if [ "$approval_line" ^!= "" ]; then>> check_change_of_unittest.sh -echo APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>> check_change_of_unittest.sh -echo echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">> check_change_of_unittest.sh -echo if [ "${APPROVALS}" == "FALSE" ]; then>> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo echo -e "It is forbidden to disable or delete the unit-test.\n" >> check_change_of_unittest.sh -echo echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]." >> check_change_of_unittest.sh -echo echo -e "Then you must have one RD (kolinwei(recommended) or zhouwei25) approval for the deletion of unit-test. \n" >> check_change_of_unittest.sh -echo echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. 
\n" >> check_change_of_unittest.sh -echo echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n" >> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo else>> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo git checkout -f origin_pr >> check_change_of_unittest.sh -%cache_dir%\tools\busybox64.exe bash check_change_of_unittest.sh +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\check_change_of_unittest.sh + goto:eof :check_change_of_unittest_error diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 89b59254e5b9105a55c68f3ef871396de1bd9199..6a30276e02ba238a0f4ee838164a5bf9976f7d84 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -89,7 +89,7 @@ class ShardingClipGrad: global_norm_fp16 = paddle.cast( global_norm_fp16, dtype=paddle.float32) - # global norm of non-distributed FP16 params_and_grads for slice parameter + # global norm of non-distributed FP16 params_and_grads for unslice parameter if len(unslice_params_fp16) == 0: global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) else: @@ -104,21 +104,20 @@ class ShardingClipGrad: [0.], dtype=paddle.float32) global_norm_fp32 = layers.reduce_sum(global_norm_fp32) - # global norm of non-distributed FP32 params_and_grads for slice parameter + # global norm of non-distributed FP32 params_and_grads for unslice parameter global_unslice_fp32 = layers.concat(unslice_params_fp32) if len( unslice_params_fp32) != 0 else paddle.to_tensor( [0.], dtype=paddle.float32) global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32) global_unslice_var = global_unslice_fp16 + global_unslice_fp32 - global_norm_var = global_norm_fp16 + global_norm_fp32 + global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var # add all reduce to get global norm of distributed params_and_grads dev_id = int(self._device.split(":")[1]) with device_guard(dev_id, "gpu"): paddle.distributed.all_reduce(global_norm_var, group=self._group) - global_norm_var += global_unslice_var global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index e1df2324889b440737740e443228d0fa69b47b51..7733226cc09f2d6e2f9bcb8403ed1be42aa75e0c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -297,10 +297,6 @@ class TensorShapeTransformer(gast.NodeTransformer): return False def _update_name_to_var_shape(self, node): - def replace_dot(name): - # replace all '.' 
into '_' - return name.replace('.', '_') - assert isinstance(node, gast.Assign) target_node = node.targets[0] value_node = node.value @@ -315,7 +311,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if value_node.id in self.name_to_var_shape: # TODO(zhhsplendid): is context a problem for the result node of gast.parse? static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -337,7 +332,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Attribute): if self._is_var_shape(value_node): # eg: x.shape static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -370,7 +364,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Name): if value_node.id in self.name_to_var_shape: static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -387,7 +380,7 @@ class TensorShapeTransformer(gast.NodeTransformer): self.name_to_var_shape[target_id] = static_shape_var_name elif self._is_var_shape(value_node): # eg: x.shape or x.shape[0] static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse(static_shape_var_name).body[ 0].value static_shape_value_node = copy.deepcopy(value_node) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c82172780b7b2e27e430d0494ce59f7dce626d74..44e6f8e8f2a6d11371f21fff5a9dccefcd72ebed 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -960,6 +960,7 @@ set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) +set_tests_properties(test_inplace_eager_fluid PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 06d69daa75d1c755d3c9f2b111e31297c4905d8f..d05be03bbfb193ae25ee039aef1608afdef4f585 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -223,6 +223,12 @@ def dyfunc_len_paddle_shape(): print(x) +def dyfunc_dict_assign_shape(): + x = paddle.to_tensor([1, 2]) + a = {} + a['shape'] = x.shape[0] + + # 1. 
Basic tests without control flow class TestTensorShapeBasic(unittest.TestCase): def setUp(self): @@ -592,6 +598,8 @@ class TestPaddleShape(unittest.TestCase): def test_paddle_shape(self): func = paddle.jit.to_static(dyfunc_len_paddle_shape) self.assertEqual('paddle.shape(x)' in func.code, True) + func = paddle.jit.to_static(dyfunc_dict_assign_shape) + self.assertEqual("__static_convert_var_shape_suffix" in func.code, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py index 33df428388882f2e536ecebb15a1f5dae6a6afc5..81bb182802ede6a2b78ffc44345cdcf382d344af 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py @@ -19,6 +19,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import paddle import hypothesis from hypothesis import given, settings, seed, example, assume @@ -104,4 +105,5 @@ class TestConvGeluMkldnnFusePass(PassAutoScanTest): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 8e31d58195be8b17243fd5203fd8ced17c11f183..e9d9af5c11366c258d1fdab34b1e9ea345b0bfad 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -22,4 +22,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_elementwise_add_op_npu PROPERTIES TIMEOUT 200) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 877f9904f3407c8e600995c2cf65cf849d49cdd5..e01b2b691a28aa788836a7f0d66fb2723fc1b364 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -144,6 +144,7 @@ class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.set_npu() + self.init_dtype() self.use_mkldnn = False self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] @@ -153,6 +154,9 @@ class TestBatchNormOpTraining(unittest.TestCase): self.init_kernel_type() self.init_test_case() + def init_dtype(self): + self.dtype = np.float32 + def init_test_case(self): self.use_global_stats = False self.no_grad_set = set() @@ -210,11 +214,16 @@ class TestBatchNormOpTraining(unittest.TestCase): scale_shape = [c] np.random.seed(123) - x = np.random.random_sample(shape).astype(np.float32) + x = np.random.random_sample(shape).astype(self.dtype) scale = np.random.random_sample(scale_shape).astype(np.float32) bias = np.random.random_sample(scale_shape).astype(np.float32) mean, variance = self.set_mean_variance(scale_shape, x, data_layout) - y_grad = np.random.random_sample(shape).astype(np.float32) + + if self.dtype == np.float16: + mean = mean.astype(np.float32) + variance = variance.astype(np.float32) + + y_grad = np.random.random_sample(shape).astype(self.dtype) momentum_var = np.array([momentum]).astype(np.float32) y, mean_out, 
variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( @@ -275,7 +284,7 @@ class TestBatchNormOpTraining(unittest.TestCase): inputs=inputs, outputs=outputs, attrs=attrs) - block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + block.create_var(name='y@GRAD', dtype=self.dtype, shape=y.shape) # generate backward op_desc grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( @@ -320,6 +329,11 @@ class TestBatchNormOpTraining(unittest.TestCase): pass +class TestFP16BatchNormOpTraining(TestBatchNormOpTraining): + def init_dtype(self): + self.dtype = np.float16 + + class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining): def init_test_case(self): self.use_global_stats = False diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py index 012a6e59e775f8ab2d27c23c779571022d6c194f..2e15a1eac2b4b891712fb5889a8974a04c5766c0 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py @@ -132,36 +132,50 @@ class TestDepthwiseConvNPU(OpTest): self.check_output_with_place(self.place, atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return if self.dilations[0] == 1 and self.dilations[1] == 1: - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - no_grad_set=set(['Filter']), - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) - - def test_check_grad_no_input(self): - if self.dtype == np.float16: - return - if self.dilations[0] == 1 and self.dilations[1] == 1: self.check_grad_with_place( - self.place, ['Filter'], + self.place, ['Input'], 'Output', - no_grad_set=set(['Input']), + no_grad_set=set(['Filter']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + no_grad_set=set(['Filter']), max_relative_error=0.03, numeric_place=paddle.CPUPlace()) + def test_check_grad_no_input(self): + if self.dilations[0] == 1 and self.dilations[1] == 1: + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) + def init_data_format(self): self.data_format = "NCHW" @@ -267,32 +281,46 @@ class TestDepthwiseConvNPU_Padding(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=1.2) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def 
test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.7, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter']), + numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.8, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_data_format(self): self.data_format = "NCHW" diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py index d0dc86055a1635c8bac644570f17e158cb2adda3..4070d0267d95b6cec2d3a2cb9926f9b389b69c50 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py @@ -127,8 +127,6 @@ class TestConv2DOp(OpTest): self.check_output_with_place(fluid.NPUPlace(0), atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), {'Input', 'Filter'}, 'Output', @@ -136,8 +134,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Input'], 'Output', @@ -146,8 +142,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Filter'], 'Output', @@ -276,10 +270,13 @@ class TestConv2DOp_v2(OpTest): def set_npu(self): self.__class__.use_npu = True + def init_dtype(self): + self.dtype = np.float32 + def setUp(self): self.set_npu() self.op_type = "conv2d" - self.dtype = np.float32 + self.init_dtype() self.init_kernel_type() self.init_group() self.init_dilation() @@ -320,31 +317,45 @@ class TestConv2DOp_v2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), {'Input', 'Filter'}, - 'Output', - max_relative_error=0.02, - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=1.1) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=0.02, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter']), + 
numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Filter'], - 'Output', - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_test_case(self): self.pad = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py index 9b29fc812faedde2aa28c9b597c6e8449bbd36b0..a4769442b083eb845daa9f7989c8621a3d475ef8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py @@ -51,8 +51,6 @@ class TestCos(OpTest): self.check_output_with_place(self.place, atol=1e-7) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py index bd9022f56a3e77fa92c74637d5947869b201ac54..fea8502f2d7664b2717b42df9923171f880a1db2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py @@ -56,8 +56,6 @@ class TestDropoutOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 75c70e0a131ac996395427d9d3cdb7f2b7dd8ff7..f24c6c455a0cb306df4ea048641351c5309f5acd 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -65,36 +65,59 @@ class TestElementwiseAddOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.15, ) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.006, ) def test_check_grad_ingore_x(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.92, ) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.006, ) def test_check_grad_ingore_y(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + 
no_grad_set=set("Y"), + max_relative_error=0.8, ) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.006, ) class TestFP16ElementwiseAddOp(TestElementwiseAddOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py index 461e15352e3837f90dfa290bf32dddc2ab26b6b8..cbfc07f35447939c9db7e216db0d3a1f530630fe 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py @@ -116,19 +116,13 @@ class TestElementwiseMaxOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', no_grad_set=set("X")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', no_grad_set=set("Y")) @@ -213,15 +207,11 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): self.out = np.maximum(self.x, self.y.reshape(1, 1, 100)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['Y'], @@ -230,8 +220,6 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): user_defined_grads=[dy]) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py index 51cf5cdaf6d1afb4a6aad64ddac4600b8d800358..e191224df81ee419e58b843dfd90b74c3fd113c1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py @@ -64,32 +64,41 @@ class TestElementwiseMinOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', ) + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', ) def test_check_grad_ingore_x(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), ) + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), ) def test_check_grad_ingore_y(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), ) + self.check_grad_with_place( + self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.1) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), ) class 
TestElementwiseMinOpFp16(TestElementwiseMinOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py index ce645f317d054c264a730c150df42bccbfabbeee..907e149c8b2c3fb6093b279849a0aff48abfdb39 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py @@ -114,8 +114,6 @@ class TestElementwisePow(OpTest): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -184,8 +182,6 @@ class TestElementwisePowOp_broadcast_0(TestElementwisePow): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -218,8 +214,6 @@ class TestElementwisePowOp_broadcast_1(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(1, 100, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -252,8 +246,6 @@ class TestElementwisePowOp_broadcast_2(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(100, 1, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) diff --git a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py index ccd5f0649d8dc68bb9cc8bb3e1736ced26c7cf7f..6be2fe0086b128851a79016fbeb2eaf705111199 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py @@ -50,8 +50,6 @@ class TestExpNPUOP(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 89ac9e09aa3488c25000c7801f108e036f33934e..83b65630d801a40aebf59e0f8e464aae5827d84a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -34,7 +34,7 @@ class TestExpand(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} @@ -50,12 +50,8 @@ class TestExpand(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestExpandV2(TestExpand): @@ -66,7 +62,7 @@ class TestExpandV2(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = 
np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) expand_times = np.array([1, 10, 1]).astype(np.int32) @@ -145,7 +141,7 @@ class TestExpand_expand_times_all_one(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 1, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py index d7aafccc88cf8d9ffde9c0b4923239abe14c3cc9..f1d89cb8d561b2cb0b10e94d0d1f084cf8733ea1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py @@ -59,9 +59,6 @@ class TestNPUHardSigmoid(OpTest): self.check_output_with_place(self.place, atol=1e-5) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py index 32042ba83a9f7723a03f3865319dafc13e1ae649..9495cdb8a55aa9e4e62ad66117cc9b41308d5d76 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py @@ -66,8 +66,6 @@ class TestHardSwishNPU(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # There is a problem that precision of grad result using float32 # can't satisfy the default precision requirement # when compared with numeric_grads, but the results on diff --git a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py index 1c9f499d22db42bf89a40b64e8f05a131785956e..a9c195bb8cd29f2c278ee974601eca1ad7e0358d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py @@ -81,13 +81,9 @@ class TestHuberLossOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', @@ -95,8 +91,6 @@ class TestHuberLossOp(OpTest): no_grad_set=set("residual")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', diff --git a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py index 6e5b4c012053f7e5e8cee28c7d54be3152ecb4cd..d02ddae461ba5c4182c03c70f7b7e39b639baa9d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py @@ -78,8 +78,10 @@ class TestLabelSmoothOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp): diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py index 590a961269989548ee03ed550bcb6ef3faa527f0..a0472f9611eb01c8230efae3555025967398f2f0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py @@ -63,8 +63,10 @@ class TestLeadyRelu(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLeadyReluFP16(TestLeadyRelu): diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py index 9534431e99a7a2e0218fe08dfd95a770b9924915..5da3cb0ce56503da8edda2506077f7de273375ef 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py @@ -50,12 +50,8 @@ class TestLog(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestLogFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py index f6baefec7f29e769a23a8777b0a5796289c6606d..10ec8621ffa58d9a2ada40f2ff6537a685094cc5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py @@ -63,9 +63,13 @@ class TestLogSoftmaxNPUOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + self.check_grad_with_place( + self.place, ['X'], ['Out'], + user_defined_grads=[self.x_grad], + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) def test_class(op_type, typename): diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index fefff0974ae40d2b9ac9d1a5f81410283cef0761..8ec9eb1cf3572703c656408da21a1f2f3d79123e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -77,8 +77,10 @@ class TestLookupTableV2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['W'], 'Out') + self.check_grad_with_place( + self.place, ['W'], 'Out', max_relative_error=0.01) + else: + self.check_grad_with_place(self.place, ['W'], 'Out') class TestLookupTableV2FP16(TestLookupTableV2): diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py index f3df1fca30749e55599c7f19d336ddb9ff41edbd..ec51dcf3f8e3e107574dc02ee69693664b74ff36 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py @@ -39,10 +39,11 @@ class TestNearestInterpOp(OpTest): 
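# [editor's note] The log_softmax hunk above passes user_defined_grads,
# i.e. an analytic reference gradient rather than a purely numeric one.
# A minimal numpy sketch of such a reference, assuming an all-ones
# upstream gradient (function name is illustrative):
#
#   def ref_log_softmax_grad(x, axis=-1):
#       import numpy as np
#       dout = np.ones_like(x)  # upstream gradient of a sum()-style loss
#       shifted = x - np.max(x, axis=axis, keepdims=True)
#       softmax = np.exp(shifted) / np.sum(
#           np.exp(shifted), axis=axis, keepdims=True)
#       # d(log_softmax): dx = dout - softmax * sum(dout) along `axis`
#       return dout - softmax * np.sum(dout, axis=axis, keepdims=True)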
self.set_npu() self.out_size = None self.actual_shape = None + self.init_dtype() self.data_layout = 'NCHW' self.init_test_case() self.op_type = "nearest_interp_v2" - input_np = np.random.random(self.input_shape).astype("float32") + input_np = np.random.random(self.input_shape).astype(self.dtype) if self.data_layout == "NCHW": in_h = self.input_shape[2] @@ -95,8 +96,21 @@ class TestNearestInterpOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - self.check_grad_with_place( - self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.006) + + def init_dtype(self): + self.dtype = np.float32 def init_test_case(self): self.interp_method = 'nearest' @@ -108,6 +122,11 @@ class TestNearestInterpOp(OpTest): self.align_corners = False +class TestNearestNeighborInterpFP16(TestNearestInterpOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestNearestNeighborInterpCase1(TestNearestInterpOp): def init_test_case(self): self.interp_method = 'nearest' diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py index 2c41f09ff51488dd8e6eff48fa0dec0a6917bf50..8e28b3fe413b071d63123203d0f2a842f6d041ba 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py @@ -54,9 +54,6 @@ class TestNPUNormOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.006) diff --git a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py index 3b75cba60b103fce118d2b0aca6eacf50fe9b809..a7ca4edc524be12e00536fa8f08bb6223004943f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py @@ -51,8 +51,6 @@ class TestPnormOp(OpTest): self.check_output_with_place(paddle.NPUPlace(0)) def test_check_grad(self): - if self.dtype == "float16": - return self.check_grad_with_place( paddle.NPUPlace(0), ['X'], 'Out', user_defined_grads=self.gradient) diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py index 7d6c3b9bdb444667e986fa92f6be5963eaf71f97..d1d2e8b3467be10ff075f357ccf7cb43ef263db7 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py @@ -50,9 +50,10 @@ class TestPadOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.6) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): self.__class__.use_npu = True diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py index 2b8550a88de592d70299111f8d33b4e978f2177a..4822abc3b25ebed695bc6d0a9fe6b564cef3ab63 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py +++ 
b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py @@ -67,9 +67,6 @@ def create_test_fp16_class(parent): self.use_cudnn = False self.dtype = np.float16 - def test_check_grad(self): - return - cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestFp16Case.__name__ = cls_name globals()[cls_name] = TestFp16Case diff --git a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py index e8f5de005d421566451bff7a211961a311da3195..899d4ef43bd860251da77cb42b482343d5643fba 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py @@ -40,8 +40,6 @@ class TestNPUReciprocal(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py index 601a351c015f3258ebd23732dd0f76282e8f7d8e..b1cb5e02a731f8bbbc36097a73b609909fc2320b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py @@ -56,8 +56,6 @@ class TestRelu6(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py index a2547808e6f161ae1cdac5ea5944863d7c640d24..c909b14b5141fe1725e10642abec57eb416c1af8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py @@ -34,11 +34,12 @@ class TestRelu(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + self.inputs = {'X': x} self.outputs = {'Out': out} def set_npu(self): @@ -50,32 +51,18 @@ class TestRelu(OpTest): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') -class TestReluFp16(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "relu" - self.place = paddle.NPUPlace(0) - - self.init_dtype() - np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} - self.outputs = {'Out': out} - - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestReluFp16(TestRelu): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) - class TestReluNeg(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py index 4516b25b59d9c080a4ff12de162440da1f196150..489f8bfb116a19cfaf3348f647cd584483788a75 100644 --- 
a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py @@ -44,8 +44,6 @@ class TestNPUSigmoid(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 611691109e187b98d67379a3952fea0e0afd88e9..a5b203b6eea2a6c147194aabe36cbc6c600ae971 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -58,12 +58,17 @@ class TestSliceOp(OpTest): self.place = paddle.NPUPlace(0) def test_check_output(self): - self.check_output_with_place(self.place) + if self.dtype == np.float16: + self.check_output_with_place(self.place) + else: + self.check_output_with_place(self.place) def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.02) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOp2(TestSliceOp): @@ -347,8 +352,10 @@ class TestSliceOpDecsDim(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOpDecsDimFp16(TestSliceOpDecsDim): diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py index 8d78ee6a97efdd1df99c9636e8e18a2905d858a5..f0ca7788345765f5fcef3ebf23e5ca25bc97eaea 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py @@ -87,8 +87,6 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # fp32 has low precision, cpu and npu both need to relax the max_relative_error if using fp32 self.check_grad_with_place( self.place, ['Logits'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py index acb99746d231ded16032bfdc1839b6b0f3120f62..24b34fa625c6339f6b990076fe6d0a874e7ba316 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py @@ -50,12 +50,11 @@ class TestSqrt(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestSqrtFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py index caf55b4850f0b18f0fb20ed5692119c2b4ceccc2..170f6b6ca4f934c1bf29433502718b5fc35b25d4 
100644 --- a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py @@ -51,8 +51,6 @@ class TestSquare(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py index 55be94da2b7e0346d8c6783d244c9d3a2c43273e..375eef12291ec50af416c38292adecf17fa83277 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py @@ -50,12 +50,11 @@ class TestTanh(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestTanhFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..214f41c78a3a5b2c285c7b412241bb59c8ee0a75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
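# [editor's note] Summary of this new single-file test (registered with the
# 2-GPU launcher later in this patch): it builds two SimpleNet models with
# identical initial state, wraps both in paddle.DataParallel with
# find_unused_parameters=True on an explicitly constructed ProcessGroupNCCL,
# alternates random and all-ones batches so the unused-parameter branch is
# exercised on only one trainer, then after each backward broadcasts every
# gradient from rank 1 and asserts rank 0 computed the same values; it also
# accumulates w1/w2 gradients across steps to verify gradient accumulation.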
+ +from __future__ import division +from __future__ import print_function + +import unittest +import os + +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv +import paddle.fluid.core as core + +paddle.seed(1024) +np.random.seed(2021) + +batch = 5 +in_dim = 10 +out_dim = 20 + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6174, is_master, nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group + + +class SimpleNet(fluid.Layer): + def __init__(self, train_id): + super(SimpleNet, self).__init__() + self.w1 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.w2 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.share_net = Linear(out_dim, 10) + + self.unused_param = self.create_parameter( + shape=[out_dim, in_dim], dtype="float64") + + # just for test sync_params_buffers + # self.register_buffer("queue", paddle.randn([10, 5])) + # self.queue = paddle.nn.functional.normalize(self.queue, axis=0) + # self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) + + self.trainer_id = train_id + + def forward(self, x): + is_use = (paddle.equal_all( + x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and + self.trainer_id == 1) + + if is_use: + tmp = paddle.matmul(x, self.w1) + else: + tmp = paddle.matmul(x, self.w2) + + return self.share_net(tmp) + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + dist.init_parallel_env() + self.trainer_id = dist.get_rank() + + process_group = init_process_group() + self.pg = process_group + with _test_eager_guard(): + + model_a = SimpleNet(self.trainer_id) + model_b = SimpleNet(self.trainer_id) + + state_dict = model_a.state_dict() + model_b.set_state_dict(state_dict) + + model_a = paddle.DataParallel( + model_a, + find_unused_parameters=True, + process_group=process_group) + model_b = paddle.DataParallel( + model_b, + find_unused_parameters=True, + process_group=process_group) + + ones_input = paddle.ones(shape=(batch, in_dim)) + ones_input.stop_gradient = True + + w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + + for step_id in range(5): + print("==============", step_id) + random_input = paddle.rand(shape=(batch, in_dim)) + random_input.stop_gradient = True + + if step_id % 2 == 0: + out_a = model_a(random_input) + out_b = model_b(random_input) + else: + out_a = model_a(ones_input) + out_b = model_b(ones_input) + + out_a.sum().backward() + out_b.sum().backward() + + self.check_gradient(model_a.parameters()) + self.check_gradient(model_b.parameters()) + + # test acc gradient + w1_grad_sum = self.check_acc(model_a._layers.w1.grad, + w1_grad_sum, + model_b._layers.w1.grad) + w2_grad_sum = self.check_acc(model_a._layers.w2.grad, + w2_grad_sum, + model_b._layers.w2.grad) + + model_a.clear_gradients() + + def check_acc(self, grad, grad_sum, acc_grad): + if grad is not None: + grad_sum = grad_sum + grad.numpy() + acc_grad = acc_grad.numpy() if acc_grad is not None else None + np.testing.assert_allclose(grad_sum, acc_grad, rtol=1e-6) + return grad_sum + + def print_trainer_0(self, *args): + if self.trainer_id == 0: + print(*args) + + def 
broadcast_param(self, param, root): + self.pg.broadcast(param, root) + return param + + def check_gradient(self, params): + other_param = [] + for param in params: + if param.trainable and (param.grad is not None): + grad = param.grad + other_grad = self.broadcast_param(grad, root=1) + if self.trainer_id == 0: + np.testing.assert_allclose(other_grad.numpy(), grad.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py index c62c4615f74707796946137d3b44efc3cc8aeee9..b1f3a71ab3e94c7db53048b95d73795d155bd122 100644 --- a/python/paddle/fluid/tests/unittests/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -47,9 +47,7 @@ class TestProcessGroupFp32(unittest.TestCase): is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks, datetime.timedelta(0)) - gloo_store = paddle.fluid.core.GlooStore(store) - opt = paddle.fluid.core.GlooOptions() - pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks) + pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) # test allreduce sum # rank 0 diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..a434c56200061b656bc2daa0e66069f09b6949cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py @@ -0,0 +1,397 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
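# [editor's note] Summary of this new suite: it re-runs the inplace-API
# tests under the eager mode guard. TestDygraphInplace pins down the
# contract of an inplace op under _test_eager_guard: the returned tensor
# shares its id with the input, each inplace call bumps
# tensor.inplace_version by one, calling an inplace API on a leaf tensor
# that requires grad raises ValueError, and backward raises RuntimeError
# when a tensor recorded for gradient computation was modified inplace
# afterwards. The subclasses below repeat the same checks for
# squeeze_/unsqueeze_/reshape_/flatten_/scatter_ and assorted elementwise
# inplace APIs.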
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + + +class TestDygraphInplace(unittest.TestCase): + def setUp(self): + self.init_data() + self.set_np_compare_func() + + def init_data(self): + self.input_var_numpy = np.random.uniform(-5, 5, [10, 20, 1]) + self.dtype = "float32" + + def set_np_compare_func(self): + self.np_compare = np.array_equal + + def non_inplace_api_processing(self, var): + return paddle.squeeze(var) + + def inplace_api_processing(self, var): + return paddle.squeeze_(var) + + def test_inplace_api(self): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + inplace_var = self.inplace_api_processing(var) + self.assertTrue(id(var) == id(inplace_var)) + + inplace_var.exp_() + self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy())) + + def test_forward_version(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + self.assertEqual(var.inplace_version, 0) + + inplace_var = self.inplace_api_processing(var) + self.assertEqual(var.inplace_version, 1) + + inplace_var.exp_() + self.assertEqual(var.inplace_version, 2) + + inplace_var = self.inplace_api_processing(inplace_var) + self.assertEqual(var.inplace_version, 3) + + def test_leaf_inplace_var_error(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + var.stop_gradient = False + + def leaf_inplace_error(): + self.inplace_api_processing(var) + + self.assertRaises(ValueError, leaf_inplace_error) + + def test_backward_error(self): + # It raises an error because the inplace operator will result + # in incorrect gradient computation. + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + # Here, the gradient computation will use the value of var_b + var_c = var_b**2 + self.inplace_api_processing(var_b) + + loss = paddle.nn.functional.relu(var_c) + with self.assertRaisesRegexp( + RuntimeError, + "received current_inplace_version:{} != inplace_version_snapshot_:{}". + format(1, 0)): + loss.backward() + + def test_backward_success_1(self): + # var_b is modified inplace before using it, the inplace operator doesn't result + # in incorrect gradient computation. 
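# [editor's note] A compact sketch of the version-counter rule that
# test_backward_error above and the two success cases below exercise
# (values illustrative):
#
#   var_b = var_a ** 2       # var_b created, inplace_version == 0
#   var_c = var_b ** 2       # grad of this op records var_b at version 0
#   paddle.squeeze_(var_b)   # bumps var_b.inplace_version to 1
#   loss.backward()          # recorded 0 != current 1 -> RuntimeError
#
# When the inplace op runs *before* any consumer records var_b, the
# recorded and current versions agree and backward proceeds normally.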
+ grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + # Here, the gradient computation will use the value of var_b + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.non_inplace_api_processing(var_b) + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a)) + + def test_backward_success_2(self): + # Although var_b is modified inplace after using it, it does not used in gradient computation. + # The inplace operator doesn't result in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.non_inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a = var_a.grad.numpy() + self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a)) + + +class TestDygraphInplaceUnsqueeze(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.unsqueeze(var, -1) + + def inplace_api_processing(self, var): + return paddle.unsqueeze_(var, -1) + + +class TestDygraphInplaceReshape(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.reshape(var, [-1]) + + def inplace_api_processing(self, var): + return paddle.reshape_(var, [-1]) + + +class TestDygraphInplaceFlatten(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.flatten() + + def inplace_api_processing(self, var): + return var.flatten_() + + +class TestDygraphInplaceScatter(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]]) + self.dtype = "float32" + + def non_inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter(var, index, updates, overwrite=False) + + def inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter_(var, index, updates, overwrite=False) + + +class TestDygraphInplaceElu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.elu(var) + + def 
inplace_api_processing(self, var): + return paddle.nn.functional.elu_(var) + + +class TestDygraphInplaceRelu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.relu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.relu_(var) + + +class TestDygraphInplaceSoftmax(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.softmax(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.softmax_(var) + + +class TestDygraphInplaceTanh(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.tanh(var) + + def inplace_api_processing(self, var): + return paddle.tanh_(var) + + +class TestDygraphInplaceCeil(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.ceil() + + def inplace_api_processing(self, var): + return var.ceil_() + + +class TestDygraphInplaceFloor(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.floor() + + def inplace_api_processing(self, var): + return var.floor_() + + +class TestDygraphInplaceExp(TestDygraphInplace): + def set_np_compare_func(self): + self.np_compare = np.allclose + + def non_inplace_api_processing(self, var): + return var.exp() + + def inplace_api_processing(self, var): + return var.exp_() + + +class TestDygraphInplaceReciprocal(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.reciprocal() + + def inplace_api_processing(self, var): + return var.reciprocal_() + + +class TestDygraphInplaceRound(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.round() + + def inplace_api_processing(self, var): + return var.round_() + + +class TestDygraphInplaceSqrt(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1]) + self.dtype = "float32" + + def non_inplace_api_processing(self, var): + return var.sqrt() + + def inplace_api_processing(self, var): + return var.sqrt_() + + +class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt): + def non_inplace_api_processing(self, var): + return var.rsqrt() + + def inplace_api_processing(self, var): + return var.rsqrt_() + + +class TestDygraphInplaceClip(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.clip(0.6, 1.5) + + def inplace_api_processing(self, var): + return var.clip_(0.6, 1.5) + + +class TestDygraphInplaceScale(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.scale(scale=2.0, bias=3.0) + + def inplace_api_processing(self, var): + return var.scale_(scale=2.0, bias=3.0) + + +class TestDygraphInplaceAdd(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.rand(2, 3, 4) + self.dtype = "float32" + self.input_var_numpy_2 = np.random.rand(2, 3, 4).astype(self.dtype) + + def non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add_(input_var_2) + + +class TestDygraphInplaceSubtract(TestDygraphInplaceAdd): + def non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract_(input_var_2) + + +class TestLossIsInplaceVar(unittest.TestCase): + def 
test_loss_is_inplace_var(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh_() + + loss.backward() + inplace_grad_var_a = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh() + + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a)) + + +class TestContinuouslyInplace(unittest.TestCase): + def test_continuously_inplace(self): + with _test_eager_guard(): + a = paddle.rand([2, 3]) + a.stop_gradient = False + b = a * 2 + + b.reshape_([-1]) + b.reshape_([2, 3]) + b.reshape_([-1]) + + b.backward() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index d0a40f38ba25721b6f285b48d45d7a3ead37bfee..65d0e289f81329561eaec73d10aa639689f0e1d3 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -542,7 +542,7 @@ class TestComplexMatMulOp(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) def test_check_grad_ingore_x(self): self.check_grad( @@ -560,7 +560,7 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) class TestComplexMatMulOpBroadcast(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 802fcc96288f6114e81f080ef17e527b2e7ad2bd..2530fc07753e8fac56cedff1a6a9798a42059dcb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -205,5 +205,10 @@ class TestDataParallelInEagerMode(TestMultipleGpus): self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') +class TestGradientCheckInEagerMode(TestMultipleGpus): + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py index 8f3578b526e1e5fbfaf2ad27c84bef5134f17d5f..3d7c9959db9ea28ac6f6ecd0050878eee15e6cbd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py @@ -18,169 +18,174 @@ import sys import unittest sys.path.append("..") from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() # Situation 1: starts(list, no tensor), ends(list, no tensor) # 1.1 without attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp(OpTest): - def setUp(self): - self.op_type = "slice" - self.config() - self.inputs = {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': 
self.infer_flags, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [3, 3, 4] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[1:3, 0:3, 2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase1(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, 2:-1, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase2(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, :, 2:-1] +class XPUTestSliceOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + self.out.shape).astype(self.dtype) + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] # 1.2 with attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim(OpTest): - def setUp(self): - self.op_type = "slice" - self.config() - self.inputs = {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': self.infer_flags, - 'decrease_axis': self.decrease_axis, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [2, 3, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0:3, 
2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [2, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1, 0, 2] - self.ends = [1000000, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[-1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 7]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1] - self.ends = [1000000] - self.axes = [3] - self.decrease_axis = [3] - self.infer_flags = [1, 1, 1] - self.out = self.input[:, :, :, -1] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - +class XPUTestSliceOp_decs_dim(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp_decs_dim(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + self.out.shape).astype(self.dtype) + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + 
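# [editor's note] The user_defined_grad_outputs used in these XPU slice
# tests feeds a random upstream gradient through the grad kernel; the
# expected d(Input) is simply that gradient scattered back into zeros of
# the input shape. A numpy sketch under that assumption (name
# illustrative):
#
#   def ref_slice_grad(input_shape, dout, region):
#       import numpy as np
#       dx = np.zeros(input_shape, dtype=dout.dtype)
#       dx[region] = dout  # e.g. region = (slice(1, 3), slice(0, 3), ...)
#       return dx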
def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +support_types = get_xpu_op_support_types('slice') +for stype in support_types: + create_test_class(globals(), XPUTestSliceOp, stype) + create_test_class(globals(), XPUTestSliceOp_decs_dim, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py index d010e1633578ed6f4a237dbda2641b1b563633ee..cd18bd63a88f7c4366470d4d0854f4951e1ba46d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py @@ -24,221 +24,158 @@ import paddle import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard from paddle.fluid import core +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() np.random.seed(10) #Situation 1: repeat_times is a list (without tensor) -class TestTileOpRank1(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")} - self.attrs = {'repeat_times': self.repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -#with dimension expanding -class TestTileOpRank2Expanding(TestTileOpRank1): - def init_data(self): - self.ori_shape = [120] - self.repeat_times = [2, 2] - - -class TestTileOpRank2(TestTileOpRank1): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - - -class TestTileOpRank3_Corner(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (1, 1, 1) - - -class 
TestTileOpRank3_Corner2(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (2, 2) - - -class TestTileOpRank3(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 15) - self.repeat_times = (2, 1, 4) - - -class TestTileOpRank4(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 5, 7) - self.repeat_times = (3, 2, 1, 2) +class XPUTestTileOpRank1(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype) + } + self.attrs = {'repeat_times': self.repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + #with dimension expanding + class TestTileOpRank2Expanding(TestTileOpRank1): + def init_data(self): + self.ori_shape = [120] + self.repeat_times = [2, 2] + + class TestTileOpRank2(TestTileOpRank1): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + class TestTileOpRank3_Corner(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (1, 1, 1) + + class TestTileOpRank3_Corner2(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (2, 2) + + class TestTileOpRank3(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 15) + self.repeat_times = (2, 1, 4) + + class TestTileOpRank4(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.repeat_times = (3, 2, 1, 2) # Situation 2: repeat_times is a list (with tensor) -class TestTileOpRank1_tensor_attr(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - repeat_times_tensor = [] - for index, ele in enumerate(self.repeat_times): - repeat_times_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'repeat_times_tensor': repeat_times_tensor, - } - self.attrs = {"repeat_times": self.infer_repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - self.infer_repeat_times = [-1] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [1, 1] - self.infer_repeat_times = [1, -1] - - -class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - self.infer_repeat_times = [-1, 3] +class XPUTestTileOpRank1_tensor_attr(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1_tensor_attr(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = 
"tile" + self.init_data() + repeat_times_tensor = [] + for index, ele in enumerate(self.repeat_times): + repeat_times_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'repeat_times_tensor': repeat_times_tensor, + } + self.attrs = {"repeat_times": self.infer_repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + self.infer_repeat_times = [-1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [1, 1] + self.infer_repeat_times = [1, -1] + + class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + self.infer_repeat_times = [-1, 3] # Situation 3: repeat_times is a tensor -class TestTileOpRank1_tensor(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'RepeatTimes': np.array(self.repeat_times).astype("int32"), - } - self.attrs = {} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_tensor(TestTileOpRank1_tensor): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - - -# Situation 4: input x is Integer -class TestTileOpInteger(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(4, 4, 5)).astype("int32") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 5: input x is Integer -class TestTileOpInt64_t(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("int64") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 6: input x is Bool -class TestTileOpBool(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("bool") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) +class XPUTestTileOpRank1_tensor(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class 
diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh
index 2a9fb842862c2e733376d7eee985b428e497b9f9..5466a1cdd597b0f466d3a0a25def932d6a6be098 100644
--- a/tools/check_added_ut.sh
+++ b/tools/check_added_ut.sh
@@ -52,9 +52,10 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then
 elif [[ "$SYSTEM" == "Windows_NT" ]];then
     bash $PADDLE_ROOT/win_cmake.sh >prec_build.log 2>&1
 fi
-ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut
+# remove lines ending with .exe to get a correct deleted_ut list
+ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/br-ut
 cd $PADDLE_ROOT/build
-ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut
+ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/pr-ut
 cd $PADDLE_ROOT
 grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut
 if [[ "$SYSTEM" == 'Linux' ]];then
@@ -66,6 +67,8 @@ rm -rf prec_build
 if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then
     rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh
 elif [[ "$SYSTEM" == "Windows_NT" ]];then
+    # get the deleted ut list on Windows; it will be used in check_change_of_unittest.sh
+    grep -F -x -v -f pr-ut br-ut > $PADDLE_ROOT/deleted_ut
     rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/win_cmake.sh
 fi
 git checkout -f $CURBRANCH
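The two grep -F -x -v -f invocations compute exact, full-line set differences between the base-branch and PR test lists, so stray lines ending in .exe in the Windows ctest output would show up as spurious additions or deletions; that is what the new sed '/\.exe$/d' filter prevents. A rough Python equivalent of the list arithmetic (test names hypothetical):

    # grep -F -x -v -f A B prints the lines of B that do not occur verbatim in A.
    br_ut = {"test_conv_op", "test_removed_op"}   # tests built from the base branch
    pr_ut = {"test_conv_op", "test_new_op"}       # tests built from the PR branch

    added_ut = pr_ut - br_ut      # grep -F -x -v -f br-ut pr-ut
    deleted_ut = br_ut - pr_ut    # grep -F -x -v -f pr-ut br-ut (Windows-only step)
    assert added_ut == {"test_new_op"}
    assert deleted_ut == {"test_removed_op"}
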
diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td
index 861b31941200fd8a7482482cb683ff969bd05a18..ae0316036f1854e281e07de59fb5aa53201bd35e 100644
--- a/tools/infrt/custom_pdop.td
+++ b/tools/infrt/custom_pdop.td
@@ -42,6 +42,6 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte
   let hasFolder = 1;

   let builders = [
-    OpBuilder<(ins "Attribute":$value)>,
+    OpBuilder<(ins "mlir::Attribute":$value)>,
   ];
 }
diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
index 8855e1eee38717a6cffc14e9c1762af36e94fa84..b0e420da64aa280b71859b27334a2abeaaacc53b 100644
--- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
+++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
@@ -191,6 +191,21 @@ def generate_all_ops_inputs_outputs_map(op_descs):
     ops_inputs_outputs_head_file.write(cpp_style_ops_outputs_map_str)


+def get_constraint(op_type, op_proto):
+    # 2.3.1 inputs
+    constraint = "NoSideEffect"
+
+    optional_input_num_ = 0
+    for input_ in op_proto[INPUTS]:
+        if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][input_][
+                INTERMEDIATE] != True and op_proto[INPUTS][input_][
+                    DISPENSABLE] == True:
+            optional_input_num_ += 1
+    if optional_input_num_ > 1:
+        constraint += ", AttrSizedOperandSegments"
+    return constraint
+
+
 # funtion to generate paddle op dialect file
 def convert_op_proto_into_mlir(op_descs):
     dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td"
@@ -237,9 +252,11 @@ def convert_op_proto_into_mlir(op_descs):
         if (op_type in skipped_op_list) or (op_type not in original_ops_):
             continue
         automatically_generated_op_dialect.append(op_type)
+        constraint_ = get_constraint(op_type, op_proto)

         # 2.1 OpDef
-        HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [NoSideEffect]> {left_brace}\n'.format(
+        HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [{constraint}]> {left_brace}\n'.format(
             op_type_capitalize=op_type.capitalize(),
+            constraint=constraint_,
             op_type=op_type,
             left_brace="{")
         SUMMARY = '    let summary = "{} op";\n'.format(op_type)
@@ -256,14 +273,22 @@ def convert_op_proto_into_mlir(op_descs):
         ARGUMENTS = ""
         if (len(op_proto[INPUTS]) > 0 or len(op_proto[ATTRS]) > 0):
             ARGUMENTS = " let arguments = (ins "
+            # 2.3.1 inputs
             for input_ in op_proto[INPUTS]:
                 if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][
                         input_][INTERMEDIATE] != True:
-                    if op_proto[INPUTS][input_][DUPLICABLE] != "true":
-                        ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + ","
+                    if op_proto[INPUTS][input_][DISPENSABLE] != True:
+                        if op_proto[INPUTS][input_][DUPLICABLE] != True:
+                            ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + ","
+                        else:
+                            ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + ","
                     else:
-                        ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + ","
+                        if op_proto[INPUTS][input_][DUPLICABLE] != True:
+                            ARGUMENTS = ARGUMENTS + " Optional<PD_Tensor>:$" + input_ + ","
+                        else:
+                            ARGUMENTS = ARGUMENTS + " Optional<PD_Tensor_Array>:$" + input_ + ","
+
             # unsupported: BLOCK = 8;  BLOCKS = 10;
             attr_mlir_converter = {
                 0: 'SI32Attr',
@@ -332,7 +357,7 @@ def convert_op_proto_into_mlir(op_descs):
         for output_ in op_proto[OUTPUTS]:
             if op_proto[OUTPUTS][output_][EXTRA] != True and op_proto[
                     OUTPUTS][output_][INTERMEDIATE] != True:
-                if op_proto[OUTPUTS][output_][DUPLICABLE] != "true":
+                if op_proto[OUTPUTS][output_][DUPLICABLE] != True:
                     outputs = outputs + "PD_Tensor:${},".format(output_)
                 else:
                     outputs = outputs + "PD_Tensor_Array:${},".format(
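get_constraint attaches MLIR's AttrSizedOperandSegments trait as soon as an op has more than one dispensable (optional) input: with a single optional operand the flat operand list can still be split unambiguously, but with two or more, the op must carry explicit segment sizes. A standalone sketch of that predicate, assuming the op_proto layout the generator uses (the key constants' values here are placeholders):

    INPUTS, EXTRA, INTERMEDIATE, DISPENSABLE = \
        "inputs", "extra", "intermediate", "dispensable"

    def needs_segment_sizes(op_proto):
        optional_inputs = [
            name for name, info in op_proto[INPUTS].items()
            if not info[EXTRA] and not info[INTERMEDIATE] and info[DISPENSABLE]
        ]
        # More than one optional operand means MLIR cannot infer how the
        # flat operand list splits, hence AttrSizedOperandSegments.
        return len(optional_inputs) > 1
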
diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py
index 36561d4e71da8b669f1e06b0240a4d4b3b2ca92e..f632c9a9dba504d209946e494e55eb970e727629 100644
--- a/tools/infrt/generate_phi_kernel_dialect.py
+++ b/tools/infrt/generate_phi_kernel_dialect.py
@@ -43,7 +43,8 @@ precision_type_converter = {
     "float64": "FLOAT64",
     "complex64": "COMPLEX64",
     "complex128": "COMPLEX128",
-    "bool": "BOOL"
+    "bool": "BOOL",
+    "Undefined": "UNK"
 }

 kernel_types_info_file = "./kernels.json"
diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh
index 3b9f4b7273500f23d67a3062a2d4ee367c0b473b..6b2586d40819b9e25eef823dff59687114664197 100644
--- a/tools/infrt/get_phi_kernel_function.sh
+++ b/tools/infrt/get_phi_kernel_function.sh
@@ -41,7 +41,37 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \

 grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \
   | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt
-#step 3: merge all infos
+
+#step 3: get ir attr names
+ir_attr_name_info_file=`mktemp`
+# phi_cpu attr
+all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
+for ir in $all_ir_name
+do
+  attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \
+    | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \
+      gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
+      gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
+      gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
+      gsub(/Attr/,"");gsub(/\)/,""); \
+      gsub(/[,:]/,"");print $a}'`
+  echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file
+done
+# phi_gpu attr
+all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
+for ir in $all_ir_name
+do
+  attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \
+    | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \
+      gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
+      gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
+      gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
+      gsub(/Attr/,"");gsub(/\)/,""); \
+      gsub(/[,:]/,"");print $a}'`
+  echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file
+done
+
+#step 4: merge all infos
 # @input1 => phi kernel infomation : kernel_name kernel_key(GPU/CPU, precision, layout)
 # @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name
 # @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has
@@ -50,4 +80,5 @@ python3 ${PADDLE_ROOT}/tools/infrt/get_phi_kernel_info.py \
   --paddle_root_path ${PADDLE_ROOT} \
   --kernel_info_file $kernel_register_info_file \
   --infermeta_wrap_file ${temp_path}/wrap_info.txt \
+  --attr_info_file $ir_attr_name_info_file \
   --generate_file ${PADDLE_ROOT}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py
index 774f6cd6bf3648a0de7a34e01e893d212bce9770..85ad585cdefa9cbb4ac8d029e699af4d5ffaeaf7 100644
--- a/tools/infrt/get_phi_kernel_info.py
+++ b/tools/infrt/get_phi_kernel_info.py
@@ -37,6 +37,8 @@ def parse_args():
         type=str,
         required=True,
         help="inferMeta wrap info file.")
+    parser.add_argument(
+        "--attr_info_file", type=str, required=True, help="attr info file.")
     parser.add_argument(
         "--generate_file",
         type=str,
@@ -59,6 +61,23 @@ def get_kernel_info(file_path):
     return [l.strip() for l in cont]


+def get_attr_info(file_path):
+    """
+    phi_gpu.argsort.float64.any $axisBool$descending
+    """
+    ret = {}
+    with open(file_path, 'r') as f:
+        cont = f.readlines()
+    for l in cont:
+        datas = l.strip().split(' ')
+        if len(datas) == 2:
+            attrs = datas[1].split('$')
+            ret[datas[0]] = attrs[1:]
+        else:
+            ret[datas[0]] = None
+    return ret
+
+
 def merge(infer_meta_data, kernel_data, wrap_data):
     meta_map = {}
     for api in infer_meta_data:
@@ -114,14 +133,14 @@ namespace kernel {

 def gen_context(val):
     if val == "CPU":
-        return "phi::CPUContext"
-    # elif val == "GPU":
-    #     return "phi::GPUContext"
+        return "phi::CPUContext", "phi_cpu"
+    elif val == "GPU":
+        return "phi::GPUContext", "phi_gpu"
     # elif val == "XPU":
-    #     return "phi::XPUContext"
+    #     return "phi::XPUContext", "phi_xpu"
     else:
         # raise Exception(f"Unknown context type {val}")
-        return ""
+        return "", ""


 def gen_layout(val):
@@ -195,34 +214,53 @@ def gen_dtype(vals: List[str]):
     return ir_dtypes, origin_dtypes


-# TODO(wilber): Now only process CPUContext.
-def gen_register_info(resources: List[List[str]]):
+# Note: Now only process CPUContext and GPUContext.
+
+
+def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]):
     """
-    resources: [['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'], ...]
+    item: ['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta']
+    attr_data: {'phi_cpu.arg_min.float32.any': ['axisBool', 'keepdimsBool', 'flatten', 'dtype']}
     """
-    res = "void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry) {"
-    for item in resources:
-        # The output string is polluted by C++ macros, here the \ is removed
-        update_item = [v.strip('\\') for v in item]
+    ctx_name, ir_ctx_name = gen_context(item[1])
+    if (ctx_name == ""):
+        return ""
+    item[2] = gen_layout(item[2])
+    ir_dtypes, origin_dtypes = gen_dtype(item[4:-1])
+    infer_shape_func = "&phi::" + item[-1]

-        ctx_name = gen_context(update_item[1])
-        if (ctx_name == ""):
-            continue
-        update_item[2] = gen_layout(update_item[2])
-        ir_dtypes, origin_dtypes = gen_dtype(update_item[4:-1])
-        infer_shape_func = "&phi::" + update_item[-1]
+    res = ""

-        if update_item[-1] == "unknown":
-            # TODO(wilber): handle the unknown inferShape func.
-            continue
+    if item[-1] == "unknown":
+        # TODO(wilber): handle the unknown inferShape func.
+        return ""
+
+    for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes):
+        kernel_func = gen_kernel_func(item[3], ctx_name, origin_dtype)
+        ir_name = ir_ctx_name + '.' + item[0].lower(
+        ) + '.' + ir_dtype + '.' + item[2].lower()
+        if ir_name in attr_data.keys() and attr_data[ir_name] is not None:
+            attr_names = ', '.join(
+                ["\"" + a + "\"" for a in attr_data[ir_name]])
+            res += f"""
+registry->AddKernelWithAttrs("{ir_name}","""
+
+            res += f"""
+    std::bind(&KernelLauncherFunc<decltype({kernel_func}),
+                                  {kernel_func},
+                                  decltype({infer_shape_func}),
+                                  {infer_shape_func}>,
+              KernelLauncher<decltype({kernel_func}),
+                             {kernel_func},
+                             decltype({infer_shape_func}),
+                             {infer_shape_func}>(),
+              std::placeholders::_1),
+    {{{attr_names}}});
+"""

-        for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes):
-            kernel_func = gen_kernel_func(update_item[3], ctx_name,
-                                          origin_dtype)
-            ir_name = 'phi_cpu.' + update_item[0].lower(
-            ) + '.' + ir_dtype + '.' + update_item[2].lower()
+        else:
             res += f"""
-  registry->AddKernel("{ir_name}","""
+registry->AddKernel("{ir_name}","""

             res += f"""
     std::bind(&KernelLauncherFunc