diff --git a/.gitignore b/.gitignore index 801790d0a472080af607e9fbcde0284902a4ead8..664c45b7202f6bf93712062ffa1d003b575afffd 100644 --- a/.gitignore +++ b/.gitignore @@ -52,12 +52,12 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. -paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/pd/ir/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json tools/infrt/kernel_signature.json -paddle/infrt/dialect/pd_ops_info.h +paddle/infrt/dialect/pd/common/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 661c3675c84b27a7ed8210fec0cfeaa2c858487c..ba6f0396008fc25dd21d462a2d19285a6cbe9080 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -61,6 +61,7 @@ set(PADDLE2ONNX_OPTIONAL_ARGS -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} -DWITH_STATIC=OFF -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 3fca45cc068f9916b52b3f99df2baa679d4c3546..49ba9479d49e93143665b8314d04ee8e0efcbf51 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api string_helper) if (WITH_DISTRIBUTE) cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 5dc43af117825bf95407255e93e1e4600e8ddd9a..cb82677a281e990d9837f081b0d4d2f3b0a34a26 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -171,10 +171,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank, "Only CPU place is supported for ProcessGroupGloo.")); } -ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, - int rank, int world_size, - const std::shared_ptr options) - : ProcessGroup(rank, world_size), _tag(0), _store(store) { +ProcessGroupGloo::ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); auto prefix_store = ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 24f156571a427128f09cd28e632212f47fa4cd47..71e0a40f8a76181d9f4db13ddd57b31de676910b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -52,8 +52,7 @@ class ProcessGroupGloo : public ProcessGroup { class GlooStore : public ::gloo::rendezvous::Store { public: - explicit GlooStore( - const std::shared_ptr& store) + explicit GlooStore(const std::shared_ptr& store) : _store(store) {} ~GlooStore() = default; @@ -87,7 +86,7 @@ class ProcessGroupGloo : public ProcessGroup { } protected: - std::shared_ptr _store; + std::shared_ptr _store; }; class GlooOptions { @@ -100,9 +99,9 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr<::gloo::transport::Device> device; }; - explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, - int world_size, - std::shared_ptr options); + explicit ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, std::shared_ptr options); ~ProcessGroupGloo() = default; @@ -145,7 +144,7 @@ class ProcessGroupGloo : public ProcessGroup { protected: uint32_t _tag; std::shared_ptr _context; - std::shared_ptr _store; + std::shared_ptr<::gloo::rendezvous::Store> _store; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 5533f3f4cbf4b136c52b35cb74afefb86cbe73d7..be4c5423943f5076201b75e307094c75d3d9c103 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -17,6 +17,20 @@ namespace paddle { namespace distributed { +static Backend TransToBackend(platform::Place place) { + static const std::map type_backend = { + {phi::AllocationType::GPU, Backend::GPU}, + {phi::AllocationType::CPU, Backend::CPU}, + }; + + phi::AllocationType type = place.GetType(); + auto it = type_backend.find(type); + PADDLE_ENFORCE_EQ(it != type_backend.end(), true, + platform::errors::InvalidArgument( + "Place type (%s) is not supported. ", place)); + return it->second; +} + std::vector> Eager_AssignGroupBySize( const std::vector tensors, const std::vector &is_sparse_gradient, @@ -297,10 +311,18 @@ EagerReducer::EagerReducer( std::dynamic_pointer_cast(grad_node); accumulation_grad_node->RegisterReduceHook( std::make_shared(reduce_hook)); + + gradnode_index_map_[grad_node.get()] = global_var_index; } vars_marked_ready_.resize(tensors_.size(), false); local_used_vars_.resize(tensors_.size(), 0); + + if (find_unused_vars_each_step_) { + global_used_vars_ = paddle::experimental::empty( + ScalarArray({static_cast(tensors_.size())}), DataType::INT32, + TransToBackend(inner_place_)); + } } std::shared_ptr EagerReducer::GetGradNodeFromTensor( @@ -341,21 +363,10 @@ void EagerReducer::InitializeGroups( } else { // process the dense gradient. InitializeDenseGroups(tensor_indices_, &group); - experimental::Backend backend; - switch (inner_place_.GetType()) { - case phi::AllocationType::GPU: - backend = experimental::Backend::GPU; - break; - case phi::AllocationType::CPU: - backend = experimental::Backend::CPU; - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Place type (%s) is not supported. ", inner_place_)); - break; - } + // experimental::Backend backend = TransToBackend(inner_place_); group.dense_contents_ = paddle::experimental::empty( - ScalarArray({group.all_length_}), group.dtype_, backend); + ScalarArray({group.all_length_}), group.dtype_, + TransToBackend(inner_place_)); } // map tensors to this group by VariableLocator @@ -418,6 +429,53 @@ void EagerReducer::InitializeDenseGroups( p_group->all_length_ = all_length; } +void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { + std::queue queue; + std::set visited; + + for (const auto &output : outputs) { + auto *auto_grad_meta = + static_cast(output.get_autograd_meta()); + if (!auto_grad_meta) continue; + auto shared_grad_node = auto_grad_meta->GetMutableGradNode(); + if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr || + auto_grad_meta->StopGradient()) { + continue; + } + egr::GradNodeBase *grad_node = shared_grad_node.get(); + queue.emplace(grad_node); + } + + while (!queue.empty()) { + egr::GradNodeBase *node = queue.front(); + queue.pop(); + const std::vector> &edges = node->GetEdges(); + for (size_t i = 0; i < edges.size(); i++) { + for (size_t j = 0; j < edges[i].size(); j++) { + const egr::Edge &edge = edges[i][j]; + auto next_node_shared = edge.GetMutableGradNode(); + if (!next_node_shared || !next_node_shared.get()) { + continue; + } + auto *next_node = next_node_shared.get(); + const bool was_inserted = visited.insert(next_node).second; + if (was_inserted) { + queue.emplace(next_node); + } + } + } + } + + for (const auto &it : gradnode_index_map_) { + if (visited.count(it.first) == 0) { + unused_vars_.push_back(it.second); + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Tensor " << tensors_[it.second].name() << " at index " + << it.second << " is marked as unused."; + } + } +} + void EagerReducer::PrepareForBackward(const std::vector &outputs) { VLOG(3) << "after forward, then reset count for backward."; grad_need_hooks_ = true; @@ -429,6 +487,51 @@ void EagerReducer::PrepareForBackward(const std::vector &outputs) { // reinitialize vars_marked_ready_ for next iteration vars_marked_ready_.clear(); vars_marked_ready_.resize(tensors_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } + + if (unused_vars_.size() == tensors_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } } void EagerReducer::AddDistHook(size_t var_index) { @@ -446,36 +549,104 @@ void EagerReducer::AddDistHook(size_t var_index) { auto &tensor = tensors_[var_index]; const auto &grad_node = GetGradNodeFromTensor(&tensor); - VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name() - << "] arrived and triggered disthook"; + VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name() + << "@Grad] arrived and triggered disthook"; local_used_vars_[var_index] = 1; + if (!has_marked_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } MarkVarReady(var_index, true); } void EagerReducer::MarkVarReady(const size_t var_index, const bool is_used_var) { + VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name() + << "] is marked ready."; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, tensors_[var_index].name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; const auto group_index = var_locator.group_index; const auto inside_group_index = var_locator.inside_group_index; auto &group = groups_[group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; - auto *autograd_meta = tensors_[var_index].get_autograd_meta(); - auto &grad_tensor = static_cast(autograd_meta)->Grad(); - - group_tensor - .ShareDataWith( - *(std::dynamic_pointer_cast(grad_tensor.impl()))) - .Resize({grad_tensor.numel()}); - - vars_marked_ready_[var_index] = true; + const auto length = group.length_[inside_group_index]; + + if (is_used_var) { + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast(autograd_meta)->Grad(); + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + } else { + // TODO(shenliang03): maybe save the memory by avoiding tensor construction + if (!group_tensor.initialized()) { + group_tensor.Resize({static_cast(length)}); + group_tensor.mutable_data(inner_place_, group.dtype_); + } + if (HasGrad(var_index)) { + VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad"; + auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]); + group_tensor + .ShareDataWith(*( + std::dynamic_pointer_cast(grad_tensor->impl()))) + .Resize({length}); + } else { + VLOG(3) << "Tensor[" << tensors_[var_index].name() + << "] doesn't have grad"; + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); + group_tensor.Resize({static_cast(length)}); + phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0); + } + } if (--group.pending_ == 0) { // can start allreduce MarkGroupReady(group_index); } + + if (next_group_ == groups_.size()) { + FinalizeBackward(); + } } void EagerReducer::MarkGroupReady(size_t group_index) { @@ -501,6 +672,92 @@ void EagerReducer::MarkGroupReady(size_t group_index) { } } +bool EagerReducer::HasGrad(size_t var_index) { + auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]); + if (grad && grad->is_initialized()) { + return true; + } else { + return false; + } +} + +void EagerReducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. + VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + + const auto *dev_ctx = + platform::DeviceContextPool::Instance().Get(inner_place_); + auto *global_used_tensor = + std::dynamic_pointer_cast(global_used_vars_.impl()) + .get(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + std::vector reduce_tensors = {global_used_vars_}; + process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + dev_ctx->Wait(); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Var [" << var_index << "] [" << tensors_[var_index].name() + << "] global_unused: " << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank " + << process_group_->GetRank() << "]"; + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + auto &src_tensor = group.dense_tensors_[inside_group_index]; + + Tensor grad_value(std::make_shared(src_tensor)); + + auto dest_var_base = tensors_[var_index]; + auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base); + grad_tensor->copy_(grad_value, inner_place_, true); + grad_tensor->reshape(dest_var_base.shape()); + } + } +} + +void EagerReducer::FinalizeBackward() { + groups_need_finalize_ = false; + grad_need_hooks_ = false; + for (auto &group : groups_) { + group.task->Synchronize(); + } + + for (auto &group : groups_) { + group.SplitTensors(inner_place_); + } + + if (find_unused_vars_each_step_) { + ProcessUnusedDenseVars(); + local_used_vars_.clear(); + local_used_vars_.resize(tensors_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; +} + void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index) { // The overall timeline: concat > div_nranks > allreduce > split @@ -513,24 +770,14 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, group->ConcatTensors(inner_place_); // div nranks - double scaling = 1.0 / nranks_; - paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false); + paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0, + false); // all_reduce std::vector reduce_tensors = {group->dense_contents_}; - tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts)); + group->task = process_group_->AllReduce(reduce_tensors, opts); - if (tasks_.size() == groups_.size()) { - for (size_t index = 0; index < tasks_.size(); index++) { - auto &task = tasks_.back(); - task->Synchronize(); - tasks_.pop_back(); - } - for (size_t index = 0; index < groups_.size(); index++) { - auto &group = groups_[index]; - group.SplitTensors(inner_place_); - } - } + // split in FinalizeBackward() } std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index ac6f3fbe5956cd47d4385343509d41afec0b69a4..d3ffa8498a14b0d0ade02ea459e1c6058550122f 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -28,6 +28,8 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/utils/string/string_helper.h" namespace paddle { namespace distributed { @@ -35,6 +37,7 @@ using Tensor = paddle::experimental::Tensor; using Scalar = paddle::experimental::ScalarBase; using ScalarArray = paddle::experimental::ScalarArrayBase; +using Backend = paddle::experimental::Backend; std::vector> Eager_AssignGroupBySize( const std::vector, const std::vector &is_sparse_gradient, @@ -61,6 +64,9 @@ class EagerGroup { // external message of group phi::DataType dtype_; + // help to sync + std::shared_ptr task; + // context is used to select the stream for concat void ConcatTensors(const platform::Place &); @@ -98,6 +104,10 @@ class EagerReducer { void MarkVarReady(const size_t var_index, const bool is_used_var); void MarkGroupReady(const size_t group_index); void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + void FinalizeBackward(); + void TraverseBackwardGraph(const std::vector &outputs); + void ProcessUnusedDenseVars(); + bool HasGrad(size_t var_index); private: std::vector tensors_; @@ -105,7 +115,6 @@ class EagerReducer { std::vector is_sparse_gradient_; std::shared_ptr process_group_; std::vector group_size_limits_; - bool find_unused_vars_each_step_; std::vector groups_; std::vector variable_locators_; @@ -113,12 +122,20 @@ class EagerReducer { platform::Place inner_place_; size_t next_group_ = 0; int64_t nranks_ = -1; - std::vector> tasks_; bool grad_need_hooks_{false}; std::vector vars_marked_ready_; - std::vector local_used_vars_; + std::vector local_used_vars_; + + // Following variables are to help unused vars + std::vector unused_vars_; + std::map gradnode_index_map_; + bool has_marked_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; + bool groups_need_finalize_{false}; + Tensor global_used_vars_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 3e734b1b9ed241f54e14d8a7c94b834674db1054..8641b36a1be8ea51dc4ad911214c2cebe6121e20 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_PYTHON) endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) -if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) +if(WITH_DISTRIBUTE AND WITH_PSCORE) set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) else() set(BRPC_DEPS "") diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 8d2ec5c41d86499393f62c65c4519960669b8fd8..80a6b4667aa1a0dbfd957a390c9202ea1a4d2b68 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { VLOG(3) << "Message bus releases resource."; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) server_.Stop(1000); server_.Join(); #endif @@ -87,8 +86,7 @@ bool MessageBus::Send(int64_t dst_rank, IsInit(), true, platform::errors::PreconditionNotMet( "Using message bus since it has not been initialized.")); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) int retry_time = 0; // message bus will retry sending for 10 times while (retry_time < 10) { ++retry_time; @@ -173,8 +171,7 @@ void MessageBus::ListenPort() { LOG(INFO) << "No need listen to port since training on single card."; return; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // function keep listen the port and handle the message PADDLE_ENFORCE_EQ( server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0, @@ -203,8 +200,7 @@ void MessageBus::ListenPort() { #endif } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool MessageBus::SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message) { const auto& dst_addr = GetAddr(dst_rank); diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index d805ac81606b8928b069174bf8aadc693db2aa0c..dfd65fdbc00d445a11f60f4e1cde4f4da77b80dc 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -20,8 +20,7 @@ #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "brpc/channel.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/message_service.h" @@ -64,8 +63,7 @@ class MessageBus final { const std::string& GetAddr(int64_t rank) const; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // send the message inter rank (dst is different rank with src) bool SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message); @@ -81,8 +79,7 @@ class MessageBus final { // the ip needs to be listened std::string addr_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) MessageServiceImpl message_service_; // brpc server brpc::Server server_; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index c3fff98f684ad5f0feb74f30fd51404d4693c7f9..1c66d83ea34d702733b3a5c0386abb62d4e1ec8a 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h index 02f73471e3b911adc622ca990bca70b7a5f3033d..5ab687ff93dc4fc2ccd0884456cdbf2d6c3c6fcb 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #pragma once #include "brpc/server.h" diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 0ae87812bce434be5e664aefea4bba19ae147d28..fac30e26c388c65af13135699a886a3c69031d57 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -115,6 +115,7 @@ message TableParameter { optional CommonAccessorParameter common = 6; optional TableType type = 7; optional bool compress_in_save = 8 [ default = false ]; + optional GraphParameter graph_parameter = 9; } message TableAccessorParameter { @@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule optional double ada_epsilon = 5 [ default = 1e-08 ]; repeated float weight_bounds = 6; } + +message GraphParameter { + optional int32 task_pool_size = 1 [ default = 24 ]; + optional bool gpups_mode = 2 [ default = false ]; + optional string gpups_graph_sample_class = 3 + [ default = "CompleteGraphSampler" ]; + optional string gpups_graph_sample_args = 4 [ default = "" ]; + optional bool use_cache = 5 [ default = true ]; + optional float cache_ratio = 6 [ default = 0.3 ]; + optional int32 cache_ttl = 7 [ default = 5 ]; + optional GraphFeature graph_feature = 8; + optional string table_name = 9 [ default = "" ]; + optional string table_type = 10 [ default = "" ]; + optional int32 gpups_mode_shard_num = 11 [ default = 127 ]; + optional int32 gpu_num = 12 [ default = 1 ]; +} + +message GraphFeature { + repeated string name = 1; + repeated string dtype = 2; + repeated int32 shape = 3; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc index 301708f6b7bb3d465d8dcbd2b94bbc4c217fcc77..a3db88e3b679da63a9b205cc013d579cf9a4be2f 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -44,7 +44,7 @@ void GraphPsService_Stub::service( } } -int GraphBrpcClient::get_server_index_by_id(uint64_t id) { +int GraphBrpcClient::get_server_index_by_id(int64_t id) { int shard_num = get_shard_num(); int shard_per_server = shard_num % server_size == 0 ? shard_num / server_size @@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) { } std::future GraphBrpcClient::get_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { std::vector request2server; @@ -66,7 +66,7 @@ std::future GraphBrpcClient::get_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -129,7 +129,7 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) @@ -179,9 +179,9 @@ std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { return fut; } std::future GraphBrpcClient::add_graph_node( - uint32_t table_id, std::vector &node_id_list, + uint32_t table_id, std::vector &node_id_list, std::vector &is_weighted_list) { - std::vector> request_bucket; + std::vector> request_bucket; std::vector> is_weighted_bucket; bool add_weight = is_weighted_list.size() > 0; std::vector server_index_arr; @@ -191,7 +191,7 @@ std::future GraphBrpcClient::add_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); if (add_weight) is_weighted_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( @@ -229,7 +229,7 @@ std::future GraphBrpcClient::add_graph_node( size_t node_num = request_bucket[request_idx].size(); closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); if (add_weight) { bool weighted[is_weighted_bucket[request_idx].size() + 1]; for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) @@ -248,8 +248,8 @@ std::future GraphBrpcClient::add_graph_node( return fut; } std::future GraphBrpcClient::remove_graph_node( - uint32_t table_id, std::vector &node_id_list) { - std::vector> request_bucket; + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; std::vector server_index_arr; std::vector index_mapping(server_size, -1); for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { @@ -257,7 +257,7 @@ std::future GraphBrpcClient::remove_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( node_id_list[query_idx]); @@ -291,7 +291,7 @@ std::future GraphBrpcClient::remove_graph_node( closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); // PsService_Stub rpc_stub(get_cmd_channel(server_index)); GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); @@ -303,9 +303,9 @@ std::future GraphBrpcClient::remove_graph_node( } // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - // std::vector>> &res, - std::vector> &res, + uint32_t table_id, std::vector node_ids, int sample_size, + // std::vector>> &res, + std::vector> &res, std::vector> &res_weight, bool need_weight, int server_index) { if (server_index != -1) { @@ -337,7 +337,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[node_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[node_idx].emplace_back( @@ -358,7 +358,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(0)->set_table_id(table_id); closure->request(0)->set_client_id(_client_id); closure->request(0)->add_params((char *)node_ids.data(), - sizeof(uint64_t) * node_ids.size()); + sizeof(int64_t) * node_ids.size()); closure->request(0)->add_params((char *)&sample_size, sizeof(int)); closure->request(0)->add_params((char *)&need_weight, sizeof(bool)); ; @@ -380,14 +380,14 @@ std::future GraphBrpcClient::batch_sample_neighbors( server2request[server_index] = request2server.size(); request2server.push_back(server_index); } - // res.push_back(std::vector>()); + // res.push_back(std::vector>()); res.push_back({}); if (need_weight) { res_weight.push_back({}); } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -428,7 +428,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[query_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[query_idx].emplace_back( @@ -459,7 +459,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -476,7 +476,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( } std::future GraphBrpcClient::random_sample_nodes( uint32_t table_id, int server_index, int sample_size, - std::vector &ids) { + std::vector &ids) { DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; @@ -490,7 +490,7 @@ std::future GraphBrpcClient::random_sample_nodes( auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { - ids.push_back(*(uint64_t *)(buffer + index)); + ids.push_back(*(int64_t *)(buffer + index)); index += GraphNode::id_size; } delete[] buffer; @@ -633,7 +633,7 @@ std::future GraphBrpcClient::pull_graph_list( } std::future GraphBrpcClient::set_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &features) { std::vector request2server; @@ -646,7 +646,7 @@ std::future GraphBrpcClient::set_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); std::vector>> features_idx_buckets( request_call_num); @@ -696,7 +696,7 @@ std::future GraphBrpcClient::set_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h index 06e753d028baa2d9c0002620dc445d4204046180..e2b8a518615dc511a726c4be104cb03900dd2e9a 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual ~GraphBrpcClient() {} // given a batch of nodes, sample graph_neighbors for each of them virtual std::future batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>& res, + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>& res, std::vector>& res_weight, bool need_weight, int server_index = -1); @@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future random_sample_nodes(uint32_t table_id, int server_index, int sample_size, - std::vector& ids); + std::vector& ids); virtual std::future get_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, std::vector>& res); virtual std::future set_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, const std::vector>& features); virtual std::future clear_nodes(uint32_t table_id); virtual std::future add_graph_node( - uint32_t table_id, std::vector& node_id_list, + uint32_t table_id, std::vector& node_id_list, std::vector& is_weighted_list); virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, @@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future load_graph_split_config(uint32_t table_id, std::string path); virtual std::future remove_graph_node( - uint32_t table_id, std::vector& node_id_list); + uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } - int get_server_index_by_id(uint64_t id); + int get_server_index_by_id(int64_t id); void set_local_channel(int index) { this->local_channel = get_cmd_channel(index); } diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 441f489fb3097cda51fc62dc35e93264a1f7caef..20a55e4d11983dad37b9e2e7845923dded881d3b 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table, return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector is_weighted_list; if (request.params_size() == 2) { size_t weight_list_size = request.params(1).size() / sizeof(bool); @@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, "graph_get_node_feat request requires at least 1 argument"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); ((GraphTable *)table)->remove_graph_node(node_ids); return 0; @@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( "graph_random_sample_neighbors request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); bool need_weight = *(bool *)(request.params(2).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); @@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( int32_t GraphBrpcService::graph_random_sample_nodes( Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - size_t size = *(uint64_t *)(request.params(0).c_str()); + size_t size = *(int64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == @@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, "graph_get_node_feat request requires at least 2 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); @@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( "at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_t node_num = request.params(0).size() / sizeof(int64_t), size_of_size_t = sizeof(size_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); - bool need_weight = *(uint64_t *)(request.params(2).c_str()); - // std::vector res = ((GraphTable + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); + bool need_weight = *(int64_t *)(request.params(2).c_str()); + // std::vector res = ((GraphTable // *)table).filter_out_non_exist_nodes(node_data, sample_size); std::vector request2server; std::vector server2request(server_size, -1); - std::vector local_id; + std::vector local_id; std::vector local_query_idx; size_t rank = get_rank(); for (int query_idx = 0; query_idx < node_num; ++query_idx) { @@ -496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector> local_buffers; std::vector local_actual_sizes; std::vector seq; - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = @@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, "graph_set_node_feat request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index 088edcb75bbc67d6d2acef9609b442f6fa38c332..c8be0f797109078509eeced53920845ac4c51684 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } -void add_graph_node(std::vector node_ids, +void add_graph_node(std::vector node_ids, std::vector weight_list) {} -void remove_graph_node(std::vector node_ids) {} +void remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) { } void GraphPyClient::add_graph_node(std::string name, - std::vector& node_ids, + std::vector& node_ids, std::vector& weight_list) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name, } void GraphPyClient::remove_graph_node(std::string name, - std::vector& node_ids) { + std::vector& node_ids) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = get_ps_client()->remove_graph_node(table_id, node_ids); @@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) { } } -std::pair>, std::vector> +std::pair>, std::vector> GraphPyClient::batch_sample_neighbors(std::string name, - std::vector node_ids, + std::vector node_ids, int sample_size, bool return_weight, bool return_edges) { - // std::vector>> v; - std::vector> v; + std::vector> v; std::vector> v1; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name, // res.first[1]: slice index // res.first[2]: src nodes // res.second: edges weight - std::pair>, std::vector> res; + std::pair>, std::vector> res; res.first.push_back({}); res.first.push_back({}); if (return_edges) res.first.push_back({}); @@ -342,10 +341,10 @@ void GraphPyClient::use_neighbors_sample_cache(std::string name, status.wait(); } } -std::vector GraphPyClient::random_sample_nodes(std::string name, - int server_index, - int sample_size) { - std::vector v; +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = @@ -357,7 +356,7 @@ std::vector GraphPyClient::random_sample_nodes(std::string name, // (name, dtype, ndarray) std::vector> GraphPyClient::get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names) { std::vector> v( feature_names.size(), std::vector(node_ids.size())); @@ -371,7 +370,7 @@ std::vector> GraphPyClient::get_node_feat( } void GraphPyClient::set_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features) { if (this->table_id_map.count(node_type)) { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index c25ef5035453ded0996cfe190dec71b0ce4b9b4a..85707137c1800ed9486148584ce22a78c52a47fd 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -70,18 +70,34 @@ class GraphPyService { ::paddle::distributed::TableAccessorParameter* accessor_proto = sparse_table_proto->mutable_accessor(); - ::paddle::distributed::CommonAccessorParameter* common_proto = - sparse_table_proto->mutable_common(); + // ::paddle::distributed::CommonAccessorParameter* common_proto = + // sparse_table_proto->mutable_common(); + ::paddle::distributed::GraphParameter* graph_proto = + sparse_table_proto->mutable_graph_parameter(); + + ::paddle::distributed::GraphFeature* graph_feature = + graph_proto->mutable_graph_feature(); + + graph_proto->set_task_pool_size(24); + + graph_proto->set_table_name(table_name); + graph_proto->set_table_type(table_type); + graph_proto->set_use_cache(false); // Set GraphTable Parameter - common_proto->set_table_name(table_name); - common_proto->set_name(table_type); + // common_proto->set_table_name(table_name); + // common_proto->set_name(table_type); + // for (size_t i = 0; i < feat_name.size(); i++) { + // common_proto->add_params(feat_dtype[i]); + // common_proto->add_dims(feat_shape[i]); + // common_proto->add_attributes(feat_name[i]); + // } + for (size_t i = 0; i < feat_name.size(); i++) { - common_proto->add_params(feat_dtype[i]); - common_proto->add_dims(feat_shape[i]); - common_proto->add_attributes(feat_name[i]); + graph_feature->add_dtype(feat_dtype[i]); + graph_feature->add_shape(feat_shape[i]); + graph_feature->add_name(feat_name[i]); } - accessor_proto->set_accessor_class("CommMergeAccessor"); } @@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService { void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); void clear_nodes(std::string name); - void add_graph_node(std::string name, std::vector& node_ids, + void add_graph_node(std::string name, std::vector& node_ids, std::vector& weight_list); - void remove_graph_node(std::string name, std::vector& node_ids); + void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); - std::pair>, std::vector> - batch_sample_neighbors(std::string name, std::vector node_ids, + std::pair>, std::vector> + batch_sample_neighbors(std::string name, std::vector node_ids, int sample_size, bool return_weight, bool return_edges); - std::vector random_sample_nodes(std::string name, int server_index, - int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); std::vector> get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names); void use_neighbors_sample_cache(std::string name, size_t total_size_limit, size_t ttl); - void set_node_feat(std::string node_type, std::vector node_ids, + void set_node_feat(std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features); std::vector pull_graph_list(std::string name, int server_index, diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index be916bf2e800308cdebbbfbe4e5ff4c467cf3f6f..2fa5ecb4051c568fa0697b236bcfb9c00e4319bf 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table) - cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 54b98cb96ce5196bb5133f777b2571f4d3d43c6e..2c07bd65d63d408b1bff12eda7bcf8fba3336db6 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -27,6 +27,288 @@ namespace paddle { namespace distributed { +#ifdef PADDLE_WITH_HETERPS + +int CompleteGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + std::cout << "in graph sampling" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + std::vector> tasks; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + paddle::framework::GpuPsGraphNode node; + std::vector &v = + this->graph_table->shards[i]->get_bucket(); + size_t ind = i % this->graph_table->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + size_t location = v[j]->get_id() % this->gpu_num; + node.node_id = v[j]->get_id(); + node.neighbor_size = v[j]->get_neighbor_size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (int k = 0; k < node.neighbor_size; k++) + sample_neighbors_ex[ind][location].push_back( + v[j]->get_neighbor_id(k)); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + tasks.clear(); + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + int total_offset = 0; + size_t ind = i % this->graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[ind].back().neighbor_offset += total_offset; + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + return 0; +} +void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; +} + +int BasicBfsGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + while (rounds > 0 && status == GraphSamplerStatus::running) { + for (size_t i = 0; i < sample_neighbors_map.size(); i++) { + sample_neighbors_map[i].clear(); + } + sample_neighbors_map.clear(); + std::vector nodes_left(graph_table->shards.size(), + node_num_for_each_shard); + std::promise prom; + std::future fut = prom.get_future(); + sample_neighbors_map.resize(graph_table->task_pool_size_); + int task_size = 0; + std::vector> tasks; + int init_size = 0; + //__sync_fetch_and_add + std::function bfs = [&, this](int i, int id) -> int { + VLOG(0) << "in bfs " << i << " " << id; + if (this->status == GraphSamplerStatus::terminating) { + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + } + size_t ind = i % this->graph_table->task_pool_size_; + if (nodes_left[i] > 0) { + nodes_left[i]--; + auto iter = sample_neighbors_map[ind].find(id); + if (iter == sample_neighbors_map[ind].end()) { + sample_neighbors_map[ind][id] = std::vector(); + iter = sample_neighbors_map[ind].find(id); + Node *node = graph_table->shards[i]->find_node(id); + if (node != NULL) { + size_t edge_fetch_size = + std::min((size_t) this->edge_num_for_each_node, + node->get_neighbor_size()); + for (size_t k = 0; k < edge_fetch_size; k++) { + int64_t neighbor_id = node->get_neighbor_id(k); + int node_location = neighbor_id % this->graph_table->shard_num % + this->graph_table->task_pool_size_; + __sync_add_and_fetch(&task_size, 1); + graph_table->_shards_task_pool[node_location]->enqueue( + bfs, neighbor_id % this->graph_table->shard_num, neighbor_id); + iter->second.push_back(neighbor_id); + } + } + } + } + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + }; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + std::vector &v = graph_table->shards[i]->get_bucket(); + if (v.size() > 0) { + init_size++; + __sync_add_and_fetch(&task_size, 1); + int64_t id = v[0]->get_id(); + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue(bfs, i, id); + } // if + } + if (init_size == 0) { + prom.set_value(0); + } + fut.get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + std::cout << "bfs over" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + tasks.clear(); + for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + paddle::framework::GpuPsGraphNode node; + auto iter = sample_neighbors_map[i].begin(); + size_t ind = i; + for (; iter != sample_neighbors_map[i].end(); iter++) { + size_t location = iter->first % this->gpu_num; + node.node_id = iter->first; + node.neighbor_size = iter->second.size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (auto k : iter->second) + sample_neighbors_ex[ind][location].push_back(k); + } + return 0; + })); + } + + for (size_t i = 0; i < tasks.size(); i++) { + tasks[i].get(); + sample_neighbors_map[i].clear(); + } + tasks.clear(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + int total_offset = 0; + size_t ind = i % graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[i].back().neighbor_offset += total_offset; + // neighbor_offset[i].push_back(total_offset + + // neighbor_offset_ex[j][i][k]); + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + // int64_t total_neighbors = + // std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0); + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + rounds--; + if (rounds > 0) { + for (int i = 0; + i < interval && this->status == GraphSamplerStatus::running; i++) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } + } + return 0; +} +void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; + node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10; + edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10; + rounds = args.size() > 2 ? std::stoi(args[2]) : 1; + interval = args.size() > 3 ? std::stoi(args[3]) : 60; +} + +#endif + std::vector GraphShard::get_batch(int start, int end, int step) { if (start < 0) start = 0; std::vector res; @@ -38,10 +320,10 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } -int32_t GraphTable::add_graph_node(std::vector &id_list, +int32_t GraphTable::add_graph_node(std::vector &id_list, std::vector &is_weight_list) { size_t node_size = id_list.size(); - std::vector>> batch(task_pool_size_); + std::vector>> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { @@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, return 0; } -int32_t GraphTable::remove_graph_node(std::vector &id_list) { +int32_t GraphTable::remove_graph_node(std::vector &id_list) { size_t node_size = id_list.size(); - std::vector> batch(task_pool_size_); + std::vector> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) continue; @@ -98,7 +380,7 @@ void GraphShard::clear() { GraphShard::~GraphShard() { clear(); } -void GraphShard::delete_node(uint64_t id) { +void GraphShard::delete_node(int64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; int pos = iter->second; @@ -110,7 +392,7 @@ void GraphShard::delete_node(uint64_t id) { node_location.erase(id); bucket.pop_back(); } -GraphNode *GraphShard::add_graph_node(uint64_t id) { +GraphNode *GraphShard::add_graph_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new GraphNode(id)); @@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) { } return (GraphNode *)bucket[node_location[id]]; } -FeatureNode *GraphShard::add_feature_node(uint64_t id) { +FeatureNode *GraphShard::add_feature_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new FeatureNode(id)); @@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(uint64_t id) { return (FeatureNode *)bucket[node_location[id]]; } -void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) { +void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { find_node(id)->add_edge(dst_id, weight); } -Node *GraphShard::find_node(uint64_t id) { +Node *GraphShard::find_node(int64_t id) { auto iter = node_location.find(id); return iter == node_location.end() ? nullptr : bucket[iter->second]; } @@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string ¶m) { } int32_t GraphTable::get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res) { + std::vector> ranges, std::vector &res) { int start = 0, end, index = 0, total_size = 0; res.clear(); - std::vector>> tasks; + std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; - while (start < end && index < ranges.size()) { + while (start < end && index < (int)ranges.size()) { if (ranges[index].second <= start) index++; else if (ranges[index].first >= end) { @@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [this, first, second, i]() -> std::vector { + [this, first, second, i]() -> std::vector { return shards[i]->get_ids_by_range(first, second); })); } @@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); +#endif auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { /*----------------------- relocate the duplicate nodes to make them distributed evenly among threads. */ + if (!use_duplicate_nodes) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif + + return 0; + } for (auto &shard : extra_shards) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { @@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { int size = extra_nodes_to_thread_index.size(); if (size == 0) return 0; std::vector index; - for (int i = 0; i < used.size(); i++) index.push_back(i); + for (int i = 0; i < (int)used.size(); i++) index.push_back(i); sort(index.begin(), index.end(), [&](int &a, int &b) { return used[a] < used[b]; }); std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); int t = 1, aim = 0, mod = 0; - for (; t < used.size(); t++) { + for (; t < (int)used.size(); t++) { if ((used[index[t]] - used[index[t - 1]]) * t >= size) { break; } else { @@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { if (t - x <= mod) alloc[index[x]]++; alloc[index[x]] -= used[index[x]]; } - std::vector vec[index.size()]; + std::vector vec[index.size()]; for (auto p : extra_nodes_to_thread_index) { has_alloc[p.second]++; vec[p.second].push_back(p.first); @@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { has_alloc[index[right]] - alloc[index[right]]); has_alloc[index[left]] += x; has_alloc[index[right]] -= x; - uint64_t id; + int64_t id; while (x--) { id = vec[index[right]].back(); vec[index[right]].pop_back(); @@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { delete extra_shards[i]; extra_shards[i] = extra_shards_copy[i]; } +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif return 0; } -Node *GraphTable::find_node(uint64_t id) { +Node *GraphTable::find_node(int64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) @@ -443,7 +738,7 @@ Node *GraphTable::find_node(uint64_t id) { Node *node = shards[index]->find_node(id); return node; } -uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { +uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) return node_id % shard_num % shard_num_per_server % task_pool_size_; size_t src_shard_id = node_id % shard_num; @@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return src_shard_id % shard_num_per_server % task_pool_size_; } -uint32_t GraphTable::get_thread_pool_index_by_shard_index( - uint64_t shard_index) { +uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { return shard_index % shard_num_per_server % task_pool_size_; } @@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { int total_size = 0; - for (int i = 0; i < shards.size(); i++) { + for (int i = 0; i < (int)shards.size(); i++) { total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; @@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } } for (auto &pair : first_half) second_half.push_back(pair); - std::vector res; + std::vector res; get_nodes_ids_by_ranges(second_half, res); - actual_size = res.size() * sizeof(uint64_t); + actual_size = res.size() * sizeof(int64_t); buffer.reset(new char[actual_size]); char *pointer = buffer.get(); memcpy(pointer, res.data(), actual_size); return 0; } int32_t GraphTable::random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight) { size_t node_num = buffers.size(); @@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors( seq_id[index].emplace_back(idx); id_list[index].emplace_back(node_ids[idx], sample_size, need_weight); } - for (int i = 0; i < seq_id.size(); i++) { + for (int i = 0; i < (int)seq_id.size(); i++) { if (seq_id[i].size() == 0) continue; tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - uint64_t node_id; + int64_t node_id; std::vector> r; LRUResponse response = LRUResponse::blocked; if (use_cache) { @@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors( std::vector sample_keys; auto &rng = _shards_task_rng_pool[i]; for (size_t k = 0; k < id_list[i].size(); k++) { - if (index < r.size() && + if (index < (int)r.size() && r[index].first.node_key == id_list[i][k].node_key) { idx = seq_id[i][k]; actual_sizes[idx] = r[index].second.actual_size; @@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors( res.size() * (need_weight ? (Node::id_size + Node::weight_size) : Node::id_size); int offset = 0; - uint64_t id; + int64_t id; float weight; char *buffer_addr = new char[actual_size]; if (response == LRUResponse::ok) { @@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors( return 0; } -int32_t GraphTable::get_node_feat(const std::vector &node_ids, +int32_t GraphTable::get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { Node *node = find_node(node_id); @@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, if (node == nullptr) { return 0; } - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { // res[feat_idx][idx] = @@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, } int32_t GraphTable::set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; auto node = shards[index]->add_feature_node(node_id); node->set_feature_size(this->feat_name.size()); - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]); @@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, return 0; } -int32_t GraphTable::get_server_index_by_id(uint64_t id) { +int32_t GraphTable::get_server_index_by_id(int64_t id) { return id % shard_num / shard_num_per_server; } +int32_t GraphTable::initialize(const TableParameter &config, + const FsClientParameter &fs_config) { + LOG(INFO) << "in graphTable initialize"; + _config = config; + if (initialize_accessor() != 0) { + LOG(WARNING) << "Table accessor initialize failed"; + return -1; + } -int32_t GraphTable::initialize() { + if (_afs_client.initialize(fs_config) != 0) { + LOG(WARNING) << "Table fs_client initialize failed"; + // return -1; + } + auto graph = config.graph_parameter(); + shard_num = _config.shard_num(); + LOG(INFO) << "in graphTable initialize over"; + return initialize(graph); +} +int32_t GraphTable::initialize(const GraphParameter &graph) { +#ifdef PADDLE_WITH_HETERPS + if (graph.gpups_mode()) { + gpups_mode = true; + if (shard_num == 0) { + shard_num = graph.gpups_mode_shard_num(); + server_num = 1; + _shard_idx = 0; + } + auto *sampler = + CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); + auto slices = + string::split_string(graph.gpups_graph_sample_args(), ","); + std::cout << "slices" << std::endl; + for (auto x : slices) std::cout << x << std::endl; + sampler->init(graph.gpu_num(), this, slices); + graph_sampler.reset(sampler); + } +#endif + task_pool_size_ = graph.task_pool_size(); _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } - server_num = _shard_num; - // VLOG(0) << "in init graph table server num = " << server_num; - /* - _shard_num is actually server number here - when a server initialize its tables, it sets tables' _shard_num to server_num, - and _shard_idx to server - rank - */ - auto common = _config.common(); - - this->table_name = common.table_name(); - this->table_type = common.name(); + auto graph_feature = graph.graph_feature(); + // this->table_name = common.table_name(); + // this->table_type = common.name(); + this->table_name = graph.table_name(); + this->table_type = graph.table_type(); VLOG(0) << " init graph table type " << this->table_type << " table name " << this->table_name; - int feat_conf_size = static_cast(common.attributes().size()); + // int feat_conf_size = static_cast(common.attributes().size()); + int feat_conf_size = static_cast(graph_feature.name().size()); for (int i = 0; i < feat_conf_size; i++) { - auto &f_name = common.attributes()[i]; - auto &f_shape = common.dims()[i]; - auto &f_dtype = common.params()[i]; + // auto &f_name = common.attributes()[i]; + // auto &f_shape = common.dims()[i]; + // auto &f_dtype = common.params()[i]; + auto &f_name = graph_feature.name()[i]; + auto &f_shape = graph_feature.shape()[i]; + auto &f_dtype = graph_feature.dtype()[i]; this->feat_name.push_back(f_name); this->feat_shape.push_back(f_shape); this->feat_dtype.push_back(f_dtype); @@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() { VLOG(0) << "init graph table feat conf name:" << f_name << " shape:" << f_shape << " dtype:" << f_dtype; } - - shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; shard_num_per_server = sparse_local_shard_num(shard_num, server_num); @@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() { return 0; } + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index c76a62248c8fcab677d3afd8b3985700ca5f2f33..7946569525cc4bb1351046632dfe5894611c4b67 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -38,10 +38,14 @@ #include #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/graph/class_macro.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#endif namespace paddle { namespace distributed { class GraphShard { @@ -51,37 +55,37 @@ class GraphShard { ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - std::vector get_ids_by_range(int start, int end) { - std::vector res; + std::vector get_ids_by_range(int start, int end) { + std::vector res; for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - GraphNode *add_graph_node(uint64_t id); + GraphNode *add_graph_node(int64_t id); GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(uint64_t id); - Node *find_node(uint64_t id); - void delete_node(uint64_t id); + FeatureNode *add_feature_node(int64_t id); + Node *find_node(int64_t id); + void delete_node(int64_t id); void clear(); - void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map &get_node_location() { + void add_neighbor(int64_t id, int64_t dst_id, float weight); + std::unordered_map &get_node_location() { return node_location; } private: - std::unordered_map node_location; + std::unordered_map node_location; std::vector bucket; }; enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct SampleKey { - uint64_t node_key; + int64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted) + SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted) : node_key(_node_key), sample_size(_sample_size), is_weighted(_is_weighted) {} @@ -300,7 +304,7 @@ class ScaledLRU { node_size += lru_pool[i].node_size - lru_pool[i].remove_count; } - if (node_size <= size_t(1.1 * size_limit) + 1) return 0; + if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0; if (pthread_rwlock_wrlock(&rwlock) == 0) { // VLOG(0)<"in shrink\n"; global_count = 0; @@ -308,9 +312,9 @@ class ScaledLRU { global_count += lru_pool[i].node_size - lru_pool[i].remove_count; } // VLOG(0)<<"global_count "< size_limit) { + if ((size_t)global_count > size_limit) { size_t remove = global_count - size_limit; - for (int i = 0; i < lru_pool.size(); i++) { + for (size_t i = 0; i < lru_pool.size(); i++) { lru_pool[i].total_diff = 0; lru_pool[i].remove_count += 1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) / @@ -352,9 +356,69 @@ class ScaledLRU { friend class RandomSampleLRU; }; +#ifdef PADDLE_WITH_HETERPS +enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 }; +class GraphTable; +class GraphSampler { + public: + GraphSampler() { + status = GraphSamplerStatus::waiting; + thread_pool.reset(new ::ThreadPool(1)); + callback = [](std::vector &res) { + return; + }; + } + virtual int run_graph_sampling() = 0; + virtual int start_graph_sampling() { + if (status != GraphSamplerStatus::waiting) { + return -1; + } + std::promise prom; + std::future fut = prom.get_future(); + graph_sample_task_over = thread_pool->enqueue([&prom, this]() { + prom.set_value(0); + status = GraphSamplerStatus::running; + return run_graph_sampling(); + }); + return fut.get(); + } + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args) = 0; + virtual void set_graph_sample_callback( + std::function &)> + callback) { + this->callback = callback; + } + + virtual int end_graph_sampling() { + if (status == GraphSamplerStatus::running) { + status = GraphSamplerStatus::terminating; + return graph_sample_task_over.get(); + } + return -1; + } + virtual GraphSamplerStatus get_graph_sampler_status() { return status; } + + protected: + std::function &)> + callback; + std::shared_ptr<::ThreadPool> thread_pool; + GraphSamplerStatus status; + std::future graph_sample_task_over; + std::vector sample_res; +}; +#endif + class GraphTable : public SparseTable { public: - GraphTable() { use_cache = false; } + GraphTable() { + use_cache = false; + shard_num = 0; +#ifdef PADDLE_WITH_HETERPS + gpups_mode = false; +#endif + rw_lock.reset(new pthread_rwlock_t()); + } virtual ~GraphTable(); virtual int32_t pull_graph_list(int start, int size, std::unique_ptr &buffer, @@ -362,7 +426,7 @@ class GraphTable : public SparseTable { int step); virtual int32_t random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight); @@ -370,9 +434,11 @@ class GraphTable : public SparseTable { int &actual_sizes); virtual int32_t get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res); - virtual int32_t initialize(); - + std::vector> ranges, std::vector &res); + virtual int32_t initialize() { return 0; } + virtual int32_t initialize(const TableParameter &config, + const FsClientParameter &fs_config); + virtual int32_t initialize(const GraphParameter &config); int32_t load(const std::string &path, const std::string ¶m); int32_t load_graph_split_config(const std::string &path); @@ -380,13 +446,13 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); - int32_t add_graph_node(std::vector &id_list, + int32_t add_graph_node(std::vector &id_list, std::vector &is_weight_list); - int32_t remove_graph_node(std::vector &id_list); + int32_t remove_graph_node(std::vector &id_list); - int32_t get_server_index_by_id(uint64_t id); - Node *find_node(uint64_t id); + int32_t get_server_index_by_id(int64_t id); + Node *find_node(int64_t id); virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) { @@ -407,16 +473,27 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } - virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); - virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual int32_t set_shard(size_t shard_idx, size_t server_num) { + _shard_idx = shard_idx; + /* + _shard_num is not used in graph_table, this following operation is for the + purpose of + being compatible with base class table. + */ + _shard_num = server_num; + this->server_num = server_num; + return 0; + } + virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); + virtual uint32_t get_thread_pool_index(int64_t node_id); virtual std::pair parse_feature(std::string feat_str); - virtual int32_t get_node_feat(const std::vector &node_ids, + virtual int32_t get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res); virtual int32_t set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res); @@ -433,11 +510,25 @@ class GraphTable : public SparseTable { } return 0; } - +#ifdef PADDLE_WITH_HETERPS + virtual int32_t start_graph_sampling() { + return this->graph_sampler->start_graph_sampling(); + } + virtual int32_t end_graph_sampling() { + return this->graph_sampler->end_graph_sampling(); + } + virtual int32_t set_graph_sample_callback( + std::function &)> + callback) { + graph_sampler->set_graph_sample_callback(callback); + return 0; + } +// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } +#endif protected: std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - const int task_pool_size_ = 24; + int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; @@ -450,11 +541,61 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; std::shared_ptr> scaled_lru; - std::unordered_set extra_nodes; - std::unordered_map extra_nodes_to_thread_index; + std::unordered_set extra_nodes; + std::unordered_map extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; + std::shared_ptr rw_lock; +#ifdef PADDLE_WITH_HETERPS + // paddle::framework::GpuPsGraphTable gpu_graph_table; + bool gpups_mode; + // std::shared_ptr<::ThreadPool> graph_sample_pool; + std::shared_ptr graph_sampler; + REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) +#endif +}; + +#ifdef PADDLE_WITH_HETERPS +REGISTER_PSCORE_REGISTERER(GraphSampler); +class CompleteGraphSampler : public GraphSampler { + public: + CompleteGraphSampler() {} + ~CompleteGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + std::vector> sample_nodes; + std::vector> sample_neighbors; + // std::vector sample_res; + // std::shared_ptr random; + int gpu_num; +}; + +class BasicBfsGraphSampler : public GraphSampler { + public: + BasicBfsGraphSampler() {} + ~BasicBfsGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + // std::vector> sample_nodes; + std::vector> sample_nodes; + std::vector> sample_neighbors; + size_t gpu_num; + int node_num_for_each_shard, edge_num_for_each_node; + int rounds, interval; + std::vector>> + sample_neighbors_map; }; +#endif } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index 5ac0c08f97d76f6bc1cb77f1f6cd0da77be2385f..f46e659a88babb07918d02f1e05859829895f2bf 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -23,6 +23,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" + #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { @@ -117,13 +118,9 @@ class TruncatedGaussianInitializer : public Initializer { seed_ = static_cast(std::stoi(attrs[1])); mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_); - float b_normal_cdf = normal_cdf((2.0 - mean_) / std_); - std::uniform_real_distribution dist_(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + + std::uniform_real_distribution dist_( + std::numeric_limits::min(), 1.0); random_engine_ = framework::GetCPURandomEngine(seed_); } diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h new file mode 100644 index 0000000000000000000000000000000000000000..bf59dbacb253707efdc527a23232fcb6c11554b4 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/graph/class_macro.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a; +#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a) +#define DECLARE_2_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_3_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_4_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_5_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_6_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_7_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_8_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_9_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_10_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_11_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__) +#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) \ + DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__) diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index d1961b655d8829716b392c24ad6f1139089eb80d..004a536e8e56c28151986d56833a5708999e297c 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -17,11 +17,11 @@ namespace paddle { namespace distributed { -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); } -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 3dfe5a6f357a7cd7d79834a20b6411995665f4fa..5fc785fe25682c8ff8de6606581cf7a13ae52999 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -24,19 +24,20 @@ class GraphEdgeBlob { GraphEdgeBlob() {} virtual ~GraphEdgeBlob() {} size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } + virtual void add_edge(int64_t id, float weight); + int64_t get_id(int idx) { return id_arr[idx]; } virtual float get_weight(int idx) { return 1; } + std::vector& export_id_array() { return id_arr; } protected: - std::vector id_arr; + std::vector id_arr; }; class WeightedGraphEdgeBlob : public GraphEdgeBlob { public: WeightedGraphEdgeBlob() {} virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); + virtual void add_edge(int64_t id, float weight); virtual float get_weight(int idx) { return weight_arr[idx]; } protected: diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index b838c2c1258d84fec8c4a25f5855209d5b428d4c..c6c594036d4fc94b296c0801b05c05801beb4fc0 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -48,6 +48,7 @@ class Node { virtual void set_feature(int idx, std::string str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } + virtual size_t get_neighbor_size() { return 0; } protected: uint64_t id; @@ -70,6 +71,7 @@ class GraphNode : public Node { } virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + virtual size_t get_neighbor_size() { return edges->size(); } protected: Sampler *sampler; diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index fa8169da07ab7fdf7ed28c840f062741913a8702..fc2ea56e95d7721fdba10e8499c22ca98bbd4c3a 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); #ifdef PADDLE_WITH_HETERPS REGISTER_PSCORE_CLASS(Table, SSDSparseTable); +REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler); +REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler); #endif REGISTER_PSCORE_CLASS(Table, SparseGeoTable); REGISTER_PSCORE_CLASS(Table, BarrierTable); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 2223334ccc442f5e53805ac8c078df07155565a8..cb46c38d4de4b7546af3e3f9e973ee2accba1921 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope serv set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 9949dce4e933b03da4260c34b3beaf2b7bcdc4f1..a2f495de3c953a418f6e9c57a0535264eb401e65 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -236,7 +236,7 @@ void RunGraphSplit() { sleep(2); std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -250,16 +250,16 @@ void RunGraphSplit() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); _vs.clear(); vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 97), 4, _vs, vs, true); + 0, std::vector(1, 97), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(3, _vs[0].size()); std::remove(edge_file_name); diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 22c2d1e60992e2955824f004fbb89ea6c22da823..565d51379d5a8519de241deea192ffbdbfa49fd0 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -48,10 +48,10 @@ namespace distributed = paddle::distributed; void testSampleNodes( std::shared_ptr& worker_ptr_) { - std::vector ids; + std::vector ids; auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); - std::unordered_set s; - std::unordered_set s1 = {37, 59}; + std::unordered_set s; + std::unordered_set s1 = {37, 59}; pull_status.wait(); for (auto id : ids) s.insert(id); ASSERT_EQ(true, s.size() == s1.size()); @@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() { void testSingleSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; auto pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 4, vs, vs1, true); + 0, std::vector(1, 37), 4, vs, vs1, true); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -126,7 +126,7 @@ void testSingleSampleNeighboor( vs.clear(); vs1.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 96), 4, vs, vs1, true); + 0, std::vector(1, 96), 4, vs, vs1, true); pull_status.wait(); s1 = {111, 48, 247}; for (auto g : vs[0]) { @@ -147,30 +147,30 @@ void testAddNode( std::shared_ptr& worker_ptr_) { worker_ptr_->clear_nodes(0); int total_num = 270000; - uint64_t id; - std::unordered_set id_set; + int64_t id; + std::unordered_set id_set; for (int i = 0; i < total_num; i++) { while (id_set.find(id = rand()) != id_set.end()) ; id_set.insert(id); } - std::vector id_list(id_set.begin(), id_set.end()); + std::vector id_list(id_set.begin(), id_set.end()); std::vector weight_list; auto status = worker_ptr_->add_graph_node(0, id_list, weight_list); status.wait(); - std::vector ids[2]; + std::vector ids[2]; for (int i = 0; i < 2; i++) { auto sample_status = worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check.insert(x); ASSERT_EQ(id_set.size(), id_set_check.size()); for (auto x : id_set) { ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); } - std::vector remove_ids; + std::vector remove_ids; for (auto p : id_set_check) { if (remove_ids.size() == 0) remove_ids.push_back(p); @@ -187,7 +187,7 @@ void testAddNode( worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check1.insert(x); ASSERT_EQ(id_set_check1.size(), id_set_check.size()); for (auto x : id_set_check1) { @@ -196,14 +196,14 @@ void testAddNode( } void testBatchSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; - std::vector v = {37, 96}; + std::vector v = {37, 96}; auto pull_status = worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -417,7 +417,7 @@ void RunBrpcPushSparse() { std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -427,14 +427,14 @@ void RunBrpcPushSparse() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; testSampleNodes(worker_ptr_); sleep(5); testSingleSampleNeighboor(worker_ptr_); testBatchSampleNeighboor(worker_ptr_); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); paddle::distributed::GraphTable* g = @@ -445,14 +445,14 @@ void RunBrpcPushSparse() { while (round--) { vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 1, _vs, vs, false); + 0, std::vector(1, 37), 1, _vs, vs, false); pull_status.wait(); for (int i = 0; i < ttl; i++) { - std::vector> vs1; + std::vector> vs1; std::vector> vs2; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 1, vs1, vs2, false); + 0, std::vector(1, 37), 1, vs1, vs2, false); pull_status.wait(); ASSERT_EQ(_vs[0].size(), vs1[0].size()); @@ -540,7 +540,7 @@ void RunBrpcPushSparse() { // Test Pull by step - std::unordered_set count_item_nodes; + std::unordered_set count_item_nodes; // pull by step 2 for (int test_step = 1; test_step < 4; test_step++) { count_item_nodes.clear(); @@ -558,18 +558,18 @@ void RunBrpcPushSparse() { ASSERT_EQ(count_item_nodes.size(), 12); } - std::pair>, std::vector> res; + std::pair>, std::vector> res; res = client1.batch_sample_neighbors( - std::string("user2item"), std::vector(1, 96), 4, true, false); + std::string("user2item"), std::vector(1, 96), 4, true, false); ASSERT_EQ(res.first[0].size(), 3); - std::vector node_ids; + std::vector node_ids; node_ids.push_back(96); node_ids.push_back(37); res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4, true, false); ASSERT_EQ(res.first[1].size(), 1); - std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); ASSERT_EQ(nodes_ids.size(), 2); ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || (nodes_ids[0] == 37 && nodes_ids[1] == 59)); diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..65455028247ddf7d310040ecae0018b619f75bf1 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +// odd id:96 48 122 112 +char edge_file_name[] = "edges.txt"; + +std::vector nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} + +void testGraphSample() { +#ifdef PADDLE_WITH_HETERPS + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(2); + + distributed::GraphTable graph_table, graph_table1; + graph_table.initialize(table_proto); + prepare_file(edge_file_name, edges); + graph_table.load(std::string(edge_file_name), std::string("e>")); + std::vector res; + std::promise prom; + std::future fut = prom.get_future(); + graph_table.set_graph_sample_callback( + [&res, &prom](std::vector &res0) { + res = res0; + prom.set_value(0); + }); + graph_table.start_graph_sampling(); + fut.get(); + graph_table.end_graph_sampling(); + ASSERT_EQ(2, res.size()); + // 37 59 97 + for (int i = 0; i < (int)res[1].node_size; i++) { + std::cout << res[1].node_list[i].node_id << std::endl; + } + ASSERT_EQ(3, res[1].node_size); + + ::paddle::distributed::GraphParameter table_proto1; + table_proto1.set_gpups_mode(true); + table_proto1.set_gpups_mode_shard_num(127); + table_proto1.set_gpu_num(2); + table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto1.set_gpups_graph_sample_args("5,5,1,1"); + graph_table1.initialize(table_proto1); + graph_table1.load(std::string(edge_file_name), std::string("e>")); + std::vector res1; + std::promise prom1; + std::future fut1 = prom1.get_future(); + graph_table1.set_graph_sample_callback( + [&res1, &prom1](std::vector &res0) { + res1 = res0; + prom1.set_value(0); + }); + graph_table1.start_graph_sampling(); + fut1.get(); + graph_table1.end_graph_sampling(); + // distributed::BasicBfsGraphSampler *sampler1 = + // (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler(); + // sampler1->start_graph_sampling(); + // std::this_thread::sleep_for (std::chrono::seconds(1)); + // std::vector res1;// = + // sampler1->fetch_sample_res(); + ASSERT_EQ(2, res1.size()); + // odd id:96 48 122 112 + for (int i = 0; i < (int)res1[0].node_size; i++) { + std::cout << res1[0].node_list[i].node_id << std::endl; + } + ASSERT_EQ(4, res1[0].node_size); +#endif +} + +TEST(testGraphSample, Run) { testGraphSample(); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 3a2ec403c0a59aaa23decc72fb9581b5a7f78343..9c4089af092e418d6845864671124917c6498cf1 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" - +DECLARE_bool(retain_grad_for_all_tensor); namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector> GradNodeAccumulation:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -62,7 +62,7 @@ operator()( grad_out = grads[0][0]; } - if (!weak_grad_.expired()) { + if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 07fa40165167ce2352018c0e1b1cb08222d5a181..a91a0b6e34c0d9440e3645d1a6982748c4315962 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 5a2595b9103e4d49845fa8938ee3577b6b3f3f06..0bc998a03a80b7b8a1e486ad68f1575c130d2c1b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector> GradNodeScale:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { // 1. Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 247fde6ed1f869542969b068cdae9f59cedd732a..e263f73a6b8a4a1f9ce23d9b5ca383fd6828016b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } void SetTensorWrappers_X( const std::vector& tensors); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index ba6a936d68651c0bcf3815eab58b5a6e66d7024c..1be3b31de00a6bb94b8ad16bff4bf9c1fa61123f 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -86,9 +86,9 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x, scale_node->SetTensorWrappers_X({x}); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradOutMeta(p_autograd_in, /*slot id*/ 0); + scale_node->SetGradOutMeta(x, /*slot id*/ 0); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradInMeta(p_autograd_out, /*slot id*/ 0); + scale_node->SetGradInMeta(out, /*slot id*/ 0); // Set History for output set current Grad Node for EagerUtils::SetHistory(p_autograd_out, scale_node); diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 77c39d1b0a37c3946e4c170484118a5fb6f79170..b485beca57a214bc00cb813e9de6a53eca1e67ea 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -30,7 +30,8 @@ namespace egr_utils_api { bool IsLeafTensor(const paddle::experimental::Tensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); - if (std::dynamic_pointer_cast(grad_node)) { + if (!grad_node || + std::dynamic_pointer_cast(grad_node)) { return true; } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 6a2e5e7ac6cd75068bba4e9b675ab67588c38366..b8d59e8dd8b4c60e28323955effd232eb2b51945 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,23 +56,29 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } -static bool IgnoreGradAttribute(const std::string& op_type, - const std::string& attr_name) { - // Attributes in operators_with_attrs are created manually during code - // generation - // We should ignore these arbitrary attrs when setting up grad attribute map - if (operators_with_attrs.count(op_type)) { - if (operators_with_attrs[op_type].count(attr_name)) { - return true; - } - } +static std::string HandleDynamicGradAttributes(const std::string& fwd_op_type, + const std::string& attrs_name) { + std::string additional_grad_attrs_str = ""; + + if (fwd_op_type == "sum") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + additional_grad_attrs_str = paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "scale", "float(1.0)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); + + } else if (fwd_op_type == "scale") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; - // Only allow SumOp - if (op_type != "sum") { - return true; + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); } - return false; + return additional_grad_attrs_str; } static void PrepareAttrMapForOps() { @@ -973,7 +979,9 @@ static bool CollectGradInformationFromOpInfo( /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + const std::string& trace_op_body_str, + std::map inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -992,7 +1000,8 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" - std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_input_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_output_autograd_meta_str = ""; // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" @@ -1000,22 +1009,39 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // output autograd_meta should be got after running TraceOP. if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " + " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_output_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + // In inplace op, the case where output is duplicable is not considered. + // Replace output directly with input in inplace op. + if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " %s = egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name, + inplace_input_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += + paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, + output_autograd_name, output_name); + } } } VLOG(6) << "Generated outputs autograd_meta"; + // input autograd_meta should be got before running TraceOP (for checking + // inplace). for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1024,28 +1050,46 @@ static std::string GenerateGradNodeCreationContent( const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else if (input.dispensable()) { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } } VLOG(6) << "Generated inputs autograd_meta"; + // check inplace input to avoid inplace operations on leaf nodes with + // stop_gradient=False. + std::string check_inplace_str = ""; + if (!inplace_map.empty()) { + const char* CHECKING_INPLACE_TEMPLATE = + " // Check Inplace\n" + " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " + "require_any_grad);\n"; + for (auto& inplace_pair : inplace_map) { + std::string inplace_name = inplace_pair.second; + check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, + inplace_name, inplace_name); + } + VLOG(6) << "Check Inplace Input"; + } + std::string prepare_autograd_meta_str = ""; - prepare_autograd_meta_str += get_autograd_meta_str; + // only generate input autograd_meta in temporary. + // output autograd_meta will be generated after running TraceOP. + prepare_autograd_meta_str += get_input_autograd_meta_str; prepare_autograd_meta_str += "\n"; // [GradOpNode] GetTraceBackward @@ -1060,7 +1104,7 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::make_shared(%d, %d);\n"; + " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += paddle::string::Sprintf( GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num); @@ -1069,14 +1113,14 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated GradOpNode construction"; // [GradOpNode] Set Attrs - grad_node_creation_str += " // Set Attributes\n"; - grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; + grad_node_creation_str += " // Set Attributes\n"; + grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; grad_node_creation_str += - " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; + " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; grad_node_creation_str += "\n"; // [GradOpNode] Set TensorWrappers - grad_node_creation_str += " // Set Tensor Wrappers\n"; + grad_node_creation_str += " // Set Tensor Wrappers\n"; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -1088,10 +1132,18 @@ static std::string GenerateGradNodeCreationContent( full_reserved = "true"; } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name, - full_reserved); + " grad_node->SetTensorWrapper%s(%s, %s);\n"; + // Replace output directly with input in inplace op. + if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = inplace_map[tensor_wrapper_name]; + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + inplace_input_name, full_reserved); + } else { + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + tensor_wrapper_name, full_reserved); + } } } grad_node_creation_str += "\n"; @@ -1109,12 +1161,12 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; + " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, input_autograd_name, input_position); @@ -1123,11 +1175,11 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(&%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; + const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( ADD_EDGES_TEMPLATE, input_autograd_name, input_position); } @@ -1139,73 +1191,125 @@ static std::string GenerateGradNodeCreationContent( std::string pass_stop_gradient_args = "false"; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - size_t output_position = fwd_outputs_name_pos_map.at(output_name); - - // Intermediate Tensor does not require SetHistory, nor RetainGrad - - if (output.duplicable()) { - pass_stop_gradient_args += ", &" + output_autograd_name; + // Replace output directly with input in inplace op. + if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); + + // Intermediate Tensor does not require SetHistory, nor RetainGrad + pass_stop_gradient_args += ", " + inplace_input_autograd_name; const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position); // Intermediate Tensor does not require SetHistory if (!output.intermediate()) { const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, inplace_input_autograd_name); } const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(&%s, %d);\n"; + " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); + SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + // Intermediate Tensor does not require CheckAndRetainGrad + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + } } else { - pass_stop_gradient_args += ", " + output_autograd_name; - const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + const std::string& output_autograd_name = "p_autograd_" + output_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); - // Intermediate Tensor does not require SetHistory + // Intermediate Tensor does not require SetHistory, nor RetainGrad + + if (output.duplicable()) { + pass_stop_gradient_args += ", &" + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + + } else { + pass_stop_gradient_args += ", " + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + } + + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } - const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); - } - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } } VLOG(6) << "Generated SetGradIn/OutMeta"; // [Generation] GradNode Creation + // After getting require_any_grad, firstly use CheckInplace method for inplace + // op. + // Then execute TraceOp and generate output autograd_meta. + // Finally, Construct GradNode. (Replace output directly with input in inplace + // op.) + // Add event record + std::string event_name = op_type + " node_creation"; const char* GRAD_NODE_CREATION_TEMPLATE = - " %s" + "%s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" - " if(require_any_grad) {\n" - " VLOG(6) << \" Construct Grad for %s \"; \n" - " egr::EagerUtils::PassStopGradient(%s);\n" - "%s\n }"; + "%s\n" + "%s" + " {\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + "%s" + " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" + " egr::EagerUtils::PassStopGradient(%s);\n" + " %s\n" + " }\n" + " }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, op_type, pass_stop_gradient_args, - grad_node_creation_str); + compute_require_grad_args, check_inplace_str, trace_op_body_str, + event_name, get_output_autograd_meta_str, op_type, + pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; } @@ -1215,7 +1319,8 @@ static std::string GenerateGradNodeCreationContent( /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + std::map inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1295,8 +1400,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type][input_position] = "list"; } else { - const char* FWD_INS_ARG_TEMPLATE = - "const paddle::experimental::Tensor& %s"; + // inplace tensor can't be const + const char* FWD_INS_ARG_TEMPLATE; + bool flag_find_input_name = false; + if (!inplace_map.empty()) { + for (auto& inplace_pair : inplace_map) { + if (inplace_pair.second == input_name) { + flag_find_input_name = true; + FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; + break; + } + } + } + if (!flag_find_input_name) { + FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; + } input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); @@ -1356,6 +1474,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; + std::string inplace_mapping_str = ""; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; @@ -1398,6 +1517,22 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_var_name); + } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + // In inplace op, replace the output with the input directly. + PADDLE_ENFORCE_NE( + inplace_map[output_name], "", + paddle::platform::errors::InvalidArgument( + "Inplace op %s has no input corresponding to output %s.", op_type, + output_name)); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + auto inplace_input_name = inplace_map[output_name]; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); + + // inplace_map used in TraceOp. + const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)"; + inplace_mapping_str += paddle::string::Sprintf( + INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name); } else { if (output.duplicable()) { outnum = output_name + "Num"; @@ -1424,6 +1559,8 @@ static std::pair GenerateForwardFunctionContents( } if (outs_contents_str.size() > 0) outs_contents_str.pop_back(); // Remove trailing "," + if (inplace_mapping_str.size() > 0) + inplace_mapping_str.pop_back(); // Remove trailing "," const char* FWD_OUTS_MAP_TEMPLATE = " std::map GenerateForwardFunctionContents( dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; + /* --------- Generate TraceOp ----- */ + // TraceOp should be run after compute require_any_grad. (for checking + // inplace) + // `trace_op_body_str` will be passed as a parameter to + // `GenerateGradNodeCreationContent`. + std::string trace_op_body_str = ""; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = " paddle::framework::AttributeMap attrs = attr_map;\n" @@ -1464,11 +1607,12 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, " "outs, attrs, \n" " egr::Controller::Instance().GetExpectedPlace(),\n" - " &default_attrs, true, {});\n"; - std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); - generated_function_body += trace_op_str; - generated_function_body += "\n"; + " &default_attrs, true, {%s});\n"; + std::string trace_op_str = paddle::string::Sprintf( + FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str); + + trace_op_body_str += trace_op_str; + trace_op_body_str += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; @@ -1533,48 +1677,64 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s;\n" - " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, - output_name, output_varname); + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Modify meta info of inplace tensor. + // Bump inplace version of inplace tensor. + auto inplace_input_name = inplace_map[output_name]; + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n" + " %s.bump_inplace_version();\n" + " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " + "Strategy.\";\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, + inplace_input_name, inplace_input_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); + } } return_types[return_position] = "paddle::experimental::Tensor"; } - return_contents[return_position] = output_varname; - generated_function_body += out_tensor_str; + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Replace output directly with input in inplace op. + return_contents[return_position] = inplace_map[output_name]; + } else { + return_contents[return_position] = output_varname; + } + trace_op_body_str += out_tensor_str; } - generated_function_body += "\n"; + trace_op_body_str += "\n"; VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; + /* ------ END Generate TraceOp ----- */ // [Generation] Handle core_ops_returns_info - core_ops_returns_info[op_type] = return_contents; + // avoid inplace op changing core_ops_returns_info + if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) { + core_ops_returns_info[op_type] = return_contents; + } // [Generation] ComputeRequireGrad -> GradNodeCreation if (!bwd_info.GenerateForwardOnly()) { - std::string grad_node_creation_body_str = - GenerateGradNodeCreationContent(fwd_info, bwd_info); - - // Add event record - std::string event_name = op_type + " node_creation"; - const char* NODE_CREATION_TEMPLATE = - "{\n" - " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " - "paddle::platform::TracerEventType::Operator, 1);\n" - " %s\n" - "}"; - - grad_node_creation_body_str = paddle::string::Sprintf( - NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str); + // If GradNode needs to be generated, pass `trace_op_body_str` + // into `GenerateGradNodeCreationContent`. + std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( + fwd_info, bwd_info, trace_op_body_str, inplace_map); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; + } else { + // If GradNode doesn't need to be generated, generate TraceOP directly. + generated_function_body += trace_op_body_str; } // [Generation] Handle return: Tuple/Vector/Tensor @@ -1621,7 +1781,13 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Generated return codes"; // [Generation] Get Full Function - std::string function_name = op_type + "_dygraph_function"; + std::string function_name; + if (inplace_map.empty()) { + function_name = op_type + "_dygraph_function"; + } else { + // change function_name for inplace op. + function_name = op_type + "__dygraph_function"; + } if (dygraph_function_args_str.size() > 0) { auto iter = dygraph_function_args_str.begin(); @@ -1629,15 +1795,15 @@ static std::pair GenerateForwardFunctionContents( } const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE = - "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " + " paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " "paddle::platform::TracerEventType::Operator, 1);"; std::string event_name = op_type + " dygraph"; std::string fwd_record_event_str = paddle::string::Sprintf( DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name); const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n" - " %s\n" - " %s\n" + "%s\n" + "%s\n" "}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, @@ -1828,7 +1994,7 @@ static std::string GenerateSingleOpBase( !is_op_base_per_duplicable_input) { const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].Size() ) },"; + "this->OutputMeta()[%d].size() ) },"; outs_contents_str += paddle::string::Sprintf( GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); } else { @@ -1866,18 +2032,9 @@ static std::string GenerateSingleOpBase( const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); - for (const auto& iter : grad_attrs) { - if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue; - std::pair type_val = - GetAttrType(iter.second, false /*is_arg*/); - const char* GRAD_ATTRS_TEMPLATE = - " %s %s = %s;\n" - " %s[\"%s\"] = %s;\n"; - std::string var_name = iter.first + std::to_string(*outs_size); - grad_attrs_str += paddle::string::Sprintf( - GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second, - attrs_name, iter.first, var_name); - } + + // Handle dynamic grad attributes + grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name); generated_grad_function_body += grad_attrs_str; const char* TRACE_OP_TEMPLATE = @@ -2056,7 +2213,7 @@ static std::string GenerateGradNodeCCContents( if (is_op_base_per_duplicable_input) { const char* OP_BASE_PER_DUP_INPUT_TEMPLATE = - " for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n" + " for(size_t i = 0; i < this->OutputMeta()[0].size(); i++) {\n" " %s\n" " }\n"; generated_grad_function_body = paddle::string::Sprintf( @@ -2068,6 +2225,8 @@ static std::string GenerateGradNodeCCContents( "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" + " if(NeedComplexToRealConversion()) " + "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; generated_grad_function_body = paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), @@ -2077,7 +2236,8 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " "GradNode%s::operator()(const " - "std::vector>& grads) {\n%s\n}"; + "std::vector>& grads, " + "bool create_graph) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); @@ -2112,18 +2272,28 @@ static std::string GenerateGradNodeHeaderContents( "\n" " virtual std::vector> " "operator()(const " - "std::vector>& grads) " + "std::vector>& grads, const " + "bool create_graph = false) " "override;\n" "\n" + " void ClearTensorWrappers() override { \n" + "%s\n" + " is_tensor_wrappers_cleared = true;\n" + " }\n" " std::string name() override { return \" GradNode%s \"; } \n " "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" + " bool IsTensorWrappersCleared() override { \n" + " return is_tensor_wrappers_cleared;\n" + " }\n" " private:\n" " // TensorWrappers\n" "%s\n" + " bool is_tensor_wrappers_cleared = false;\n" + "\n" " // Attribute Map\n" "%s\n" "};"; @@ -2157,6 +2327,7 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; + std::string clear_tensor_wrappers_str = ""; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -2188,6 +2359,13 @@ static std::string GenerateGradNodeHeaderContents( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, struct_tensor_wrapper_name); + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = + "for (auto tw: %s) {\n" + " tw.clear();\n" + " }\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); + } else { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; @@ -2200,10 +2378,14 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, tensor_wrapper_name, full_reserved_str); + + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2218,8 +2400,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, - tensor_wrapper_members_str, attr_members_str); + op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str, + set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; } @@ -2404,7 +2586,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* --------------------------- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = - GenerateForwardFunctionContents(fwd_info, bwd_info); + GenerateForwardFunctionContents(fwd_info, bwd_info, {}); fwd_function_str += body_and_declaration.first + "\n"; @@ -2412,6 +2594,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) { std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // Inplace Function Generator. + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------"; + std::pair inplace_body_and_declaration = + GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map); + + fwd_function_str += inplace_body_and_declaration.first + "\n"; + + VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------"; + std::string inplace_fwd_function_declare_str = + inplace_body_and_declaration.second; + dygraph_forward_api_str += inplace_fwd_function_declare_str; + } + if (bwd_info.GenerateForwardOnly()) continue; VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 53af6c1048d2454b1e9f375b837103930026ae54..771351dd4affbb355748c275a59681a6d5ba5577 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -27,6 +27,7 @@ add_custom_target(eager_final_state_codegen set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h") set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h") + add_custom_target(eager_final_state_python_c_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" "--api_yaml_path=${api_yaml_path}" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 656418a05ad6d04bc19838c97d86db9cda19c1c6..1685b6f3cb5c3cc2ecbb1b773b9708703b3746c4 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -28,6 +28,7 @@ namespace = "" yaml_types_mapping = { 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'str' : 'std::string', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', @@ -212,7 +213,8 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys() + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." arg_type = yaml_types_mapping[arg_type] arg_name = RemoveSpecialSymbolsInName(arg_name) @@ -247,7 +249,8 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys() + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type @@ -475,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, # SetTensorWrapper Methods & TensorWrapper Members set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): if tname in no_need_buffer_set: no_need_buffer = "true" @@ -496,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + {}.clear(); +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + else: assert IsVectorTensorType(ttype) SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ @@ -513,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + for (auto tw: {}) { + tw.clear(); + }; +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + # End: SetTensorWrapper Methods & TensorWrapper Members # SetAttributes & Attribute Members @@ -521,7 +541,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, for aname, atype, default_val, _ in backward_attrs_list: saved_attr_name = GetSavedName(aname) SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ + void SetAttribute{}({} {}) {{ {} = {}; }} """ @@ -552,25 +572,37 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads) override; + const std::vector>& grads, bool create_graph = false) override; std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + // SetTensorWrapperX, SetTensorWrapperY, ... {} // SetAttributes {} + + bool IsTensorWrappersCleared() override {{ + return is_tensor_wrappers_cleared; + }} private: // TensorWrappers {} + bool is_tensor_wrappers_cleared = false; + // Attributes {} }}; """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - grad_node_name, set_tensor_wrapper_methods_str, - set_attribute_methods_str, tensor_wrapper_members_str, - attribute_members_str) + grad_node_name, clear_tensor_wrapper_str, + set_tensor_wrapper_methods_str, set_attribute_methods_str, + tensor_wrapper_members_str, attribute_members_str) return node_declaration_str @@ -624,6 +656,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, else: # Rearrange output order accordingly returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n" + returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) @@ -634,7 +667,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, grad_api_namespace = f"paddle::experimental" FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(const std::vector>& grads) {{ +std::vector> {}::operator()(const std::vector>& grads, bool create_graph) {{ // Call grad_api function auto grad_api_returns = {}::{}({}); {} @@ -697,7 +730,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -734,8 +767,11 @@ def GenerateNodeCreationCodes( else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: - if IsVectorTensorType(atype): - tw_name = f"api_result[{pos}]" + if num_fwd_outputs > 1: + # Aligned with forward output position + assert name in forward_outputs_position_map.keys() + fwd_output_pos = forward_outputs_position_map[name][1] + tw_name = f"std::get<{fwd_output_pos}>(api_result)" else: tw_name = f"api_result" @@ -751,7 +787,7 @@ def GenerateNodeCreationCodes( set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): input_autograd_meta_name = GetAutoGradMetaName(name) - set_grad_out_meta = f" grad_node->SetGradOutMeta({input_autograd_meta_name}, {pos});" + set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) set_edges_list.append(set_edges) @@ -768,17 +804,18 @@ def GenerateNodeCreationCodes( output_autograd_meta_name = GetAutoGradMetaName(name) set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" - set_grad_in_meta = f" grad_node->SetGradInMeta({output_autograd_meta_name}, {pos});" + if num_outputs == 1: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" + else: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" + set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) set_grad_in_meta_list.append(set_grad_in_meta) - - if num_outputs == 1: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" - else: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);" set_retain_grad_list.append(set_retain_grad) + set_out_rank_str = "\n".join(set_out_rank_list) set_history_str = "\n".join(set_history_list) set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) @@ -900,7 +937,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, returns_list[0] = f"api_result" else: # Tuple api_result - returns_list[pos] = f"api_result[{pos}]" + returns_list[pos] = f"std::get<{pos}>(api_result)" if IsPlainTensorType(rtype): returns_type_list[pos] = "paddle::experimental::Tensor" @@ -1050,7 +1087,7 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/to_static/run_program_op_node.h" -#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1245,7 +1282,7 @@ if __name__ == "__main__": # Node Definition Generation definition_declaration_pair = GenerateForwardDefinition( fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, + forward_outputs_position_map, orig_forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs, intermediate_outputs) @@ -1257,7 +1294,7 @@ if __name__ == "__main__": # For python-level API dispatch CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, forward_outputs_position_map, - forward_attrs_list) + orig_forward_attrs_list) if len(namespace) > 0: forward_definition_str += f"""namespace {namespace} {{ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9b77f0449e01d6555cd3a25f101e4867ccc6ffd3..e1c2cf871ea4276d6423c8ed9c62076d13a024ec 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,9 +14,18 @@ import os import argparse +import logging from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap -skipped_fwd_api_names = set(["scale"]) +########################### +## Global Configurations ## +########################### +skipped_forward_api_names = set(["scale"]) + + +def SkipAPIGeneration(forward_api_name): + return (forward_api_name in skipped_forward_api_names) + atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -24,7 +33,7 @@ atype_to_parsing_function = { "long": "CastPyArg2Long", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", - "string": "CastPyArg2String", + "std::string": "CastPyArg2String", "std::vector": "CastPyArg2Booleans", "std::vector": "CastPyArg2Ints", "std::vector": "CastPyArg2Longs", @@ -39,64 +48,31 @@ atype_to_parsing_function = { } -def ParseArguments(): - parser = argparse.ArgumentParser( - description='Eager Code Generator Args Parser') - parser.add_argument('--api_yaml_path', type=str) - parser.add_argument('--output_path', type=str) - - args = parser.parse_args() - return args - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): - print(f"Unable to find {atype} in atype_to_parsing_function.") - assert False + assert False, f"Unable to find {atype} in atype_to_parsing_function." return atype_to_parsing_function[atype] -def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map, - optional_inputs, is_forward_only): - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # optional_inputs = [name0, ...] - - # Get EagerTensor from args - # Get dygraph function call args - num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list) - num_input_tensors = len(forward_inputs_position_map.keys()) - dygraph_function_call_list = ["" for i in range(num_args)] - get_eager_tensor_str = "" - for name, (ttype, pos) in forward_inputs_position_map.items(): - is_optional = (name in optional_inputs) - if IsVectorTensorType(ttype): - get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - if is_optional: - get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - dygraph_function_call_list[pos] = f"{name}" +########################## +## Refactored Functions ## +########################## +PARSE_PYTHON_C_TENSORS_TEMPLATE = \ +" auto {} = {}(\"{}\", \"{}\", args, {}, false);\n" + - parse_attributes_str = "" - # Get Attributes - for name, atype, _, pos in forward_attrs_list: - parsing_function = FindParsingFunctionFromAttributeType(atype) - key = f"{name}" +PARSE_PYTHON_C_ARGS_TEMPLATE = \ +""" PyObject* {}_obj = PyTuple_GET_ITEM(args, {});\n + {} {} = {}({}_obj, \"{}\", {});\n""" - parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" - dygraph_function_call_list[pos] = f"{name}" - dygraph_function_call_str = ",".join(dygraph_function_call_list) +RECORD_EVENT_TEMPLATE = \ +" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" - pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);" - PYTHON_C_FUNCTION_TEMPLATE = """ +PYTHON_C_FUNCTION_TEMPLATE = \ +""" static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ {} @@ -130,26 +106,50 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ - namespace_str = "" - if len(namespace) > 0: - namespace_str = f"{namespace}::" - if is_forward_only: - fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name - else: - fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) - python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str, - parse_attributes_str, fwd_function_name, dygraph_function_call_str) +FUNCTION_NAME_TEMPLATE = \ +"{}{}{}" - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" - return python_c_function_str, python_c_function_reg_str +PYTHON_C_FUNCTION_REG_TEMPLATE = \ +"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}" -def GenerateCoreOpsInfoMap(): - result = """ +PYTHON_C_WRAPPER_TEMPLATE = \ +""" +#pragma once + +#include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include + +namespace paddle {{ +namespace pybind {{ + +{} + +static PyMethodDef EagerFinalStateMethods[] = {{ + {} +}}; + +}} // namespace pybind +}} // namespace paddle +""" + + +CORE_OPS_INFO = \ +""" static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) { PyThreadState *tstate = nullptr; try @@ -194,9 +194,11 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { return nullptr; } } - """ +""" + - core_ops_infos_registry = """ +CORE_OPS_INFO_REGISTRY = \ +""" {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, @@ -209,7 +211,259 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"}, """ - return result, core_ops_infos_registry +NAMESPACE_WRAPPER_TEMPLATE = \ +"""namespace {} {{ + {} +}} +""" + + +####################### +## Generator Classes ## +####################### +class PythonCSingleFunctionGenerator: + def __init__(self, fwd_api_contents, namespace): + self.fwd_api_contents = fwd_api_contents + self.namespace = namespace + + # Raw Contents + self.forward_api_name = "" + self.forward_args_str = "" + self.forward_returns_str = "" + + # Raw Data + self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] + self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] + self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] + + # Processed Data + self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] } + self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] } + + # Special Op Attributes + self.optional_inputs = [] #[name, ...] + self.is_forward_only = True + + # Generated Results + self.python_c_function_str = "" + self.python_c_function_reg_str = "" + + def CollectRawContents(self): + fwd_api_contents = self.fwd_api_contents + + assert 'api' in fwd_api_contents.keys( + ), "Unable to find \"api\" in fwd_api_contents keys" + assert 'args' in fwd_api_contents.keys( + ), "Unable to find \"args\" in fwd_api_contents keys" + assert 'output' in fwd_api_contents.keys( + ), "Unable to find \"output\" in fwd_api_contents keys" + + self.forward_api_name = fwd_api_contents['api'] + self.forward_args_str = fwd_api_contents['args'] + self.forward_returns_str = fwd_api_contents['output'] + + def CollectIsForwardOnly(self): + fwd_api_contents = self.fwd_api_contents + self.is_forward_only = False if 'backward' in fwd_api_contents.keys( + ) else True + + def CollectOptionalInputs(self): + fwd_api_contents = self.fwd_api_contents + if 'optional' in fwd_api_contents.keys(): + self.optional_inputs = ParseDispensable(fwd_api_contents[ + 'optional']) + + def CollectForwardInOutAttr(self): + forward_args_str = self.forward_args_str + forward_returns_str = self.forward_returns_str + + self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward( + forward_args_str, forward_returns_str) + + def CollectForwardPositionMap(self): + forward_inputs_list = self.forward_inputs_list + forward_returns_list = self.forward_returns_list + + self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + + def GeneratePythonCFunction(self): + namespace = self.namespace + forward_api_name = self.forward_api_name + forward_attrs_list = self.forward_attrs_list + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + optional_inputs = self.optional_inputs + is_forward_only = self.is_forward_only + + # Generate Python-C Tensors Parsing Logic + get_eager_tensor_str = "" + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorListFromArgs", forward_api_name, name, pos) + else: + if is_optional: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetOptionalTensorFromArgs", forward_api_name, + name, pos) + else: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorFromArgs", forward_api_name, name, pos) + + parse_attributes_str = "" + + # Generate Python-C Attributes Parsing Logic + for name, atype, _, pos in forward_attrs_list: + parsing_function_name = FindParsingFunctionFromAttributeType(atype) + parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( + name, pos, atype, name, parsing_function_name, name, + forward_api_name, pos) + + # Generate Dygraph Function Call Logic + num_args = len(forward_inputs_position_map.keys()) + len( + forward_attrs_list) + dygraph_function_call_list = ["" for i in range(num_args)] + for name, (_, pos) in forward_inputs_position_map.items(): + dygraph_function_call_list[pos] = f"{name}" + for name, _, _, pos in forward_attrs_list: + dygraph_function_call_list[pos] = f"{name}" + dygraph_function_call_str = ",".join(dygraph_function_call_list) + + # Generate Python-C Function Definitions + if is_forward_only: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "paddle::experimental::", namespace, forward_api_name) + else: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "::", namespace, GetForwardFunctionName(forward_api_name)) + + # Generate Record Event for performance profiling + pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( + "pythonc_record_event", forward_api_name, "pybind_imperative_func") + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, fwd_function_name, + dygraph_function_call_str) + + # Generate Python-C Function Registration + self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( + forward_api_name, namespace, forward_api_name, forward_api_name) + + def run(self): + # Initialized is_forward_only + self.CollectIsForwardOnly() + + # Initialized forward_api_name, forward_args_str, forward_returns_str + self.CollectRawContents() + if SkipAPIGeneration(self.forward_api_name): return False + + # Initialized optional_inputs + self.CollectOptionalInputs() + + # Initialized forward_inputs_list, forward_returns_list, forward_attrs_list + self.CollectForwardInOutAttr() + logging.info( + f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}") + logging.info( + f"Prased Original Forward Attrs List: \n{self.forward_attrs_list}") + logging.info( + f"Parsed Original Forward Returns List: \n{self.forward_returns_list}" + ) + + # Initialized forward_inputs_position_map, forward_outputs_position_map + self.CollectForwardPositionMap() + logging.info( + f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" + ) + logging.info( + f"Generated Forward Output Position Map: {self.forward_outputs_position_map}" + ) + + # Code Generation + self.GeneratePythonCFunction() + logging.info( + f"Generated Python-C Function: {self.python_c_function_str}") + logging.info( + f"Generated Python-C Function Declaration: {self.python_c_function_reg_str}" + ) + + return True + + +class PythonCYamlGenerator: + def __init__(self, path): + self.yaml_path = path + + self.namespace = "" + self.forward_api_list = [] + + # Generated Result + self.python_c_functions_reg_str = "" + self.python_c_functions_str = "" + + def ParseYamlContents(self): + yaml_path = self.yaml_path + self.forward_api_list = ReadFwdFile(yaml_path) + + def GeneratePythonCFunctions(self): + namespace = self.namespace + forward_api_list = self.forward_api_list + + for forward_api_content in forward_api_list: + f_generator = PythonCSingleFunctionGenerator(forward_api_content, + namespace) + status = f_generator.run() + + if status == True: + self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" + self.python_c_functions_str += f_generator.python_c_function_str + "\n" + + def InferNameSpace(self): + yaml_path = self.yaml_path + if "sparse" in yaml_path: + self.namespace = "sparse::" + + def AttachNamespace(self): + namespace = self.namespace + python_c_functions_str = self.python_c_functions_str + + if namespace != "": + if namespace.endswith("::"): + namespace = namespace[:-2] + self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, python_c_functions_str) + + def run(self): + # Infer namespace from yaml_path + self.InferNameSpace() + + # Read Yaml file + self.ParseYamlContents() + + # Code Generation + self.GeneratePythonCFunctions() + + # Wrap with namespace + self.AttachNamespace() + + +############################ +## Code Generation Helper ## +############################ +def ParseArguments(): + parser = argparse.ArgumentParser( + description='Eager Code Generator Args Parser') + parser.add_argument('--api_yaml_path', type=str) + parser.add_argument('--output_path', type=str) + + args = parser.parse_args() + return args + + +def GenerateCoreOpsInfoMap(): + return CORE_OPS_INFO, CORE_OPS_INFO_REGISTRY def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): @@ -221,36 +475,6 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): python_c_function_reg_str += core_ops_infos_registry python_c_function_reg_str += "\n {nullptr,nullptr,0,nullptr}" - PYTHON_C_WRAPPER_TEMPLATE = """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/dygraph_api.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/phi/api/include/sparse_api.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include - -namespace paddle {{ -namespace pybind {{ - -{} - -static PyMethodDef EagerFinalStateMethods[] = {{ - {} -}}; - -}} // namespace pybind -}} // namespace paddle - -""" python_c_str = PYTHON_C_WRAPPER_TEMPLATE.format(python_c_function_str, python_c_function_reg_str) @@ -264,86 +488,23 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_paths = args.api_yaml_path.split(",") - python_c_functions_reg_str = "" - python_c_functions_str = "" - + generated_python_c_functions = "" + generated_python_c_registration = "" for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] - if "sparse" in api_yaml_path: - namespace = "sparse" - else: - namespace = "" - - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", - forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - # Append Namespace - python_c_functions_reg_str += ",\n".join( - python_c_function_reg_list) + "," - python_c_functions = "\n".join(python_c_function_list) - if len(namespace) > 0: - python_c_functions_str += f"""namespace {namespace} {{ - {python_c_functions} -}} -""" + y_generator = PythonCYamlGenerator(api_yaml_path) + y_generator.run() - else: - python_c_functions_str += python_c_functions + generated_python_c_functions += y_generator.python_c_functions_str + "\n" + generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n" - python_c_str = GeneratePythonCWrappers(python_c_functions_str, - python_c_functions_reg_str) + python_c_str = GeneratePythonCWrappers(generated_python_c_functions, + generated_python_c_registration) - print("Generated Python-C Codes: ", python_c_str) + logging.info(f"Generated Python-C Codes: \n{python_c_str}") output_path = args.output_path for path in [output_path]: diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 1987d024d8f3e34121f54962c45f0f8c1e91b723..17bc2441488aa3c4fc62a37e825eeb94cafea9bb 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -39,12 +39,21 @@ std::unordered_map getInDegreeMap( // Copy nodes std::queue queue = init_queue; std::unordered_set visited; + size_t potential_startup_ops_cnt = queue.size(); + size_t cnt = 0; // Visit each node exactly once in any order while (!queue.empty()) { GradNodeBase* node = queue.front(); queue.pop(); + if (cnt < potential_startup_ops_cnt) { + if (!node_in_degree_map.count(node)) { + node_in_degree_map[node] = 0; + } + cnt += 1; + } + if (visited.count(node)) { continue; } @@ -76,23 +85,248 @@ std::unordered_map getInDegreeMap( return node_in_degree_map; } -void RunBackward(const std::vector& tensors, - const std::vector& grad_tensors, - bool retain_graph) { - paddle::platform::RecordEvent backward_record_event( - "backward", paddle::platform::TracerEventType::Operator, 1); +// Remove some nodes those doesn't need to be +// stored in potential_stop_nodes、potential_startup_nodes +void UpdateGraphInfo( + std::unordered_map* + target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + // Updated potential_sotp_nodes by depending_nodes, + // make sure the path from root to target_node is ok + std::unordered_set _startup_ops; + VLOG(6) << "Running in UpdateGraphInfo"; + std::queue queue; + for (auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) { + queue.emplace(target_nodes_inputmeta_pair.first); + } + + while (!queue.empty()) { + auto* target_node = queue.front(); + queue.pop(); + if (!(*depending_nodes)[target_node].empty()) { + auto precedding_nodes = (*depending_nodes)[target_node]; + for (auto pre_nodes : precedding_nodes) { + queue.emplace(pre_nodes); + if (potential_stop_nodes->find(pre_nodes) != + potential_stop_nodes->end()) { + potential_stop_nodes->erase(pre_nodes); + } + } + } else { // startup_ops have no precedding nodes + VLOG(6) << "Emplace _startup_ops"; + _startup_ops.emplace(target_node); + } + } + // Purify potential_startup_nodes again, remove some + // potential startup_nodes that unreach to input target nodes + if (!_startup_ops.empty()) { + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto node : *potential_startup_nodes) { + if (_startup_ops.count(node) == 0) { + VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; + potential_startup_nodes_to_be_erased.emplace(node); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto node : potential_startup_nodes_to_be_erased) { + VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; + potential_startup_nodes->erase(node); + } + } + } +} + +// Get Graph Info Betweent input target gradnode and outputs, +// record depending_nodes、 potential_stop_nodes、potential_startup_nodes +void GetGraphInfoBetweenTargets( + const std::queue& init_queue, + std::unordered_map* + input_target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + if (input_target_nodes_inputmeta_map->empty()) return; + + VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; + + // Calculate in_degree for each node + std::unordered_map node_in_degree_map; + + // Copy nodes + std::queue queue = init_queue; + std::unordered_set visited; + + // Visit each node exactly once in any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Check node is target_nodes or not, if node is not target_node, + // all the next_node will be marked in potential_stop_nodes + bool is_potential_stop_nodes = + input_target_nodes_inputmeta_map->count(node); + + // Find and append next nodes + const std::vector>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node) continue; + + // if node not in input_target_nodes, + // all the next_nodes of current node will be inserted to + // potential_stop_node + if (is_potential_stop_nodes) { + potential_stop_nodes->emplace(next_node); + } + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + + // Record depending relationship + (*depending_nodes)[next_node].emplace(node); + queue.push(next_node); + } + } + } + // Update Graph Info, remove some stop_node in potential_stop_nodes + UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes, + potential_stop_nodes, potential_startup_nodes); +} + +void GetTargetNodesInfo(const std::vector& inputs, + std::unordered_map* + target_nodes_inputmeta_map) { + VLOG(6) << "Running in GetTargetNodesInfo"; + if (!inputs.empty()) { + VLOG(6) << "Inputs are not empty"; + size_t num_inputs = inputs.size(); + for (size_t i = 0; i < num_inputs; i++) { + AutogradMeta* auto_grad_meta = + EagerUtils::unsafe_autograd_meta(inputs[i]); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + PADDLE_ENFORCE_NOT_NULL(target_node, + paddle::platform::errors::Fatal( + "There is no grad op for input:%d or it's" + "stop_gradient=True", + i)); + (*target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } + } +} + +std::vector GetResults( + const std::vector& inputs, + std::unordered_map* + results_map, + bool allow_unused, bool create_graph) { + VLOG(6) << "Running in GetResults"; + if (inputs.empty()) return {}; + + std::vector results; + results.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto& input = inputs[i]; + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + auto iter = results_map->find(target_node); + if (iter != results_map->end()) { + // set StopGradient = !create_graph + AutogradMeta* tensor_auto_grad_meta = + EagerUtils::autograd_meta(&(iter->second)); + tensor_auto_grad_meta->SetStopGradient(!create_graph); + results.emplace_back(iter->second); + } else { + PADDLE_ENFORCE_EQ(allow_unused, true, + paddle::platform::errors::InvalidArgument( + "The %d-th input does not appear in the backward " + "graph. Please check the input variable or set " + "allow_unused=True to get None result.", + i)); + results.emplace_back(); + } + } + return results; +} + +// Enforce GradNode has TensorWrappers as Input +void EnforceGradNodeHasInput(GradNodeBase* node) { + VLOG(6) << "Running in EnforceGradNodeHasInput"; + PADDLE_ENFORCE_NE( + node->IsTensorWrappersCleared(), true, + paddle::platform::errors::Fatal( + "The TensorWrappers of %s do not exist. This may be because:\n" + "You calculate backward twice for the same subgraph without " + "setting retain_graph=True. Please set retain_graph=True in the " + "first backward/grad call.\n", + node->name())); +} +// Purify potential_startup_nodes, remove nodes those are the same as +// input_target_nodes +void PurifyPotentialStartUpNodes( + std::unordered_set* potential_startup_nodes, + std::unordered_map* + input_target_nodes_inputmeta_map) { + VLOG(6) << "Running in PurifyPotentialStartUpNodes"; + if (input_target_nodes_inputmeta_map->empty()) return; + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto startup_op : *potential_startup_nodes) { + auto iter = input_target_nodes_inputmeta_map->find(startup_op); + if (iter != input_target_nodes_inputmeta_map->end()) { + potential_startup_nodes_to_be_erased.emplace(iter->first); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto nodes : potential_startup_nodes_to_be_erased) { + potential_startup_nodes->erase(nodes); + } + } +} + +std::vector RunBackward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph, bool create_graph = false, + const std::vector& inputs = {}, + bool allow_unused = false, + const std::vector& no_grad_vars = {}) { VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass + std::unordered_map + no_grad_var_nodes_inputmeta_map; + // Get no_grad_vars's GradNodes and InputMeta Info + GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map); + /* --- Initialization --- */ // 1. Init queue with starting nodes // 2. Prepare initial input buffers std::queue queue; std::unordered_map> node_input_buffers_dict; + std::unordered_set potential_startup_nodes; for (size_t i = 0; i < tensors.size(); i++) { const paddle::experimental::Tensor& tensor = tensors[i]; @@ -132,8 +366,17 @@ void RunBackward(const std::vector& tensors, "size = 0 or same size as tensors")); // Feed given tensor if it's provided VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; - node_input_buffers_dict[grad_node]->add( - input_info.first, input_info.second, grad_tensors[i]); + + if (grad_tensors[i].is_initialized()) { + // Deep copy + paddle::experimental::Tensor tmp_tensor; + tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true); + node_input_buffers_dict[grad_node]->add(input_info.first, + input_info.second, tmp_tensor); + } else { + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, grad_tensors[i]); + } } else { VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; @@ -146,8 +389,9 @@ void RunBackward(const std::vector& tensors, input_info.first, input_info.second, tensor, true /*fill_one=true*/); } - // Prepare queue + // Prepare queue, potential startup_nodes queue.push(grad_node); + potential_startup_nodes.emplace(grad_node); } VLOG(6) << "Update In degree Map for backward"; @@ -155,25 +399,74 @@ void RunBackward(const std::vector& tensors, std::unordered_map node_in_degree_map = getInDegreeMap(queue); + // Get input's GradNodes and InputMeta Info + std::unordered_map + input_target_nodes_inputmeta_map; + GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map); + + // Purify potential_startup_ops, remove those nodes that are the same as + // input_target_nodes + PurifyPotentialStartUpNodes(&potential_startup_nodes, + &input_target_nodes_inputmeta_map); + + // Get Graph Info Betweent input target gradnode and outputs + // Record the depending_nodes and potential_stop_nodes + std::unordered_map /* father node */> + depending_nodes; + std::unordered_set potential_stop_nodes; + // std::unordered_set startup_ops; + + GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map, + &depending_nodes, &potential_stop_nodes, + &potential_startup_nodes); + + // ready_queue store all startup nodes + std::queue ready_queue; + // startup op's indegree should be 0 + for (auto node : potential_startup_nodes) { + if (node_in_degree_map[node] == 0) { + ready_queue.emplace(node); + } + } + + VLOG(1) << " startup_ops' size is :" << ready_queue.size(); + + std::unordered_map results_map; + + // read_queue is empty only when 1.input equals to output. 2.input can not + // reach to output. + if (ready_queue.size() == 0) { + for (auto input_target_node : input_target_nodes_inputmeta_map) { + // out rank_info of forward op + auto rank_info = input_target_node.second->OutRankInfo(); + if (node_input_buffers_dict[input_target_node.first]) { + auto& target_result = + node_input_buffers_dict[input_target_node.first] + ->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[input_target_node.first] = target_result; + } + } + } + /* --- Topological Visit --- */ // 1. Pop queue // 2. Run node + // |- Check and capture target result // |- node(grads) // |- Prepare for next node // 3. Update queue VLOG(6) << "Run Backward"; - while (!queue.empty()) { - GradNodeBase* node = queue.front(); + while (!ready_queue.empty()) { + GradNodeBase* node = ready_queue.front(); + VLOG(6) << "Running GradNode:" << node->name(); + ready_queue.pop(); paddle::platform::RecordEvent node_record_event( std::string(typeid(*node).name()) + " grad_node", paddle::platform::TracerEventType::Operator, 1); - if (queue.size() > 1 && node_in_degree_map[node] != 0) { - queue.pop(); - continue; - } - queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), @@ -184,16 +477,51 @@ void RunBackward(const std::vector& tensors, std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); + // get target grad_var from node_input_buffer by inputmeta + if (input_target_nodes_inputmeta_map.find(node) != + input_target_nodes_inputmeta_map.end()) { + VLOG(6) << "Get target result by by inputmeta"; + // out rank_info of forward op + auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank. + auto& target_result = + node_input_buffer->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; + } + + // no_grad_vars + if (no_grad_var_nodes_inputmeta_map.find(node) != + no_grad_var_nodes_inputmeta_map.end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } + + VLOG(6) << "Running GradNode:" << node->name(); + + // check input + EnforceGradNodeHasInput(node); + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers()); + (*node)(node_input_buffer->Buffers(), create_graph); + + // retain_grad or not + if (!retain_graph) { + VLOG(6) + << "retain_graph is false, need to clear the TensorWrapper of nodes."; + node->ClearTensorWrappers(); + } + // TODO(jiabin): Should we erase it or find a more efficient way. + node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " @@ -204,6 +532,7 @@ void RunBackward(const std::vector& tensors, for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { const Edge& edge = edges[i][j]; + auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them // with @@ -217,6 +546,7 @@ void RunBackward(const std::vector& tensors, grad_output_tensors[i].empty()) { continue; } + PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( @@ -252,18 +582,44 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; + PADDLE_ENFORCE( node_in_degree_map[next_node] >= 0, paddle::platform::errors::Fatal( "Detected in-degree value smaller than zero. For Node: %s" "Node's in-degree cannot be negative", next_node->name())); - if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + + bool is_potential_stop_node = potential_stop_nodes.count(next_node); + + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + ready_queue.emplace(std::move(next_node)); } } } } + + return GetResults(inputs, &results_map, allow_unused, create_graph); } +void Backward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph) { + VLOG(6) << "Run in Backward"; + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + RunBackward(tensors, grad_tensors, retain_graph); +} + +std::vector Grad( + const std::vector& tensors, // output + const std::vector& inputs, + const std::vector& grad_tensors, + bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + const std::vector& no_grad_vars) { + VLOG(6) << "Run in Grad"; + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, + allow_unused, no_grad_vars); +} } // namespace egr diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h index 2856d9fb87f34b1066bb59eb38bcaee786d2a260..bebe664838e6c1f98219ceee6e6733b49c319b3c 100644 --- a/paddle/fluid/eager/backward.h +++ b/paddle/fluid/eager/backward.h @@ -19,12 +19,20 @@ namespace egr { -// run_backward(): +// Backward(): // tensors corresponds to those lived in the backward graph // each grad_tensors[i] keeps the value for its corresponding tensors[i] -void RunBackward(const std::vector &tensors, - const std::vector &grad_tensors, - bool retain_graph = false); +void Backward(const std::vector& tensors, + const std::vector& grad_tensors, + bool retain_graph = false); + +std::vector Grad( + const std::vector& tensors, + const std::vector& inputs, + const std::vector& grad_tensors = {}, + bool retain_graph = false, bool create_graph = false, + bool only_inputs = false, bool allow_unused = false, + const std::vector& no_grad_vars = {}); // Reserved for gradient() diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 48ac8c8358afd68cee9d22b8ea0a4e8fd7c3c92e..72af1cc4b068679e72ae6bdc5e09fab8f56bac04 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -20,8 +20,8 @@ namespace egr { std::vector> RunCustomOpNode:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { paddle::CustomOpKernelContext ctx; auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index e5ddef9c062149282d790a5fd6bf31b25a20cf5a..6ece2658575c795856438904c2716d61f0985879 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -37,8 +37,8 @@ class RunCustomOpNode : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph) override; std::string name() { return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); @@ -62,6 +62,12 @@ class RunCustomOpNode : public GradNodeBase { return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + void SetAttrs(const std::vector& attr) { attrs_ = attr; } public: diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 7eb2902d935c4fd8d5990c81fbf6bcf3fd6e6e66..891ad4d8983b5b37b31ab5f5f980e74ccff47069 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -15,10 +15,16 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -33,7 +39,6 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - // adj_edges has the same num as backward outputs adj_edges_.resize(bwd_out_slot_num); } @@ -44,24 +49,20 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - for (const auto& meta : *metas) { + + for (size_t i = 0; i < metas->size(); i++) { + const auto& meta = (*metas)[i]; // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } } @@ -73,130 +74,205 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); + if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } -const std::vector& GradNodeBase::InputMeta() const { +const std::vector>& GradNodeBase::InputMeta() const { return bwd_in_meta_; } -const std::vector& GradNodeBase::OutputMeta() const { +const std::vector>& GradNodeBase::OutputMeta() const { return bwd_out_meta_; } -void GradNodeBase::SetGradInMeta(std::vector* fwd_out, +void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { - size_t slot_size = fwd_out->size(); + auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, addition " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); - // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - PADDLE_ENFORCE_NOT_NULL((*fwd_out)[i], - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be called while " - "autograd_meta is not null. If you got this " - "error, it indicates bugs in framework.")); - if ((*fwd_out)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_out)[i]->StopGradient()); + auto& metas = bwd_in_meta_.at(slot_rank); + if (metas.size() == 0) { + metas.resize(1); + } + + auto& meta = metas[0]; + meta.SetStopGradient(fwd_out_meta->StopGradient()); + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank) { +void GradNodeBase::SetGradInMeta( + const std::vector& fwd_out, + size_t slot_rank) { + size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_in_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; - meta.Init(1); - meta.SetStopGradient(0, fwd_out->StopGradient()); + if (metas.size() < slot_size) { + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + auto& meta = metas[i]; + const auto& fwd_out_tensor = fwd_out[i]; + auto* fwd_out_meta = + egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor); + PADDLE_ENFORCE_NOT_NULL(fwd_out_meta, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out_meta->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(fwd_out_meta->StopGradient()); + } + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out_tensor.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } + } } -void GradNodeBase::SetGradOutMeta(std::vector* fwd_in, +void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, size_t slot_rank) { - size_t slot_size = fwd_in->size(); + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in); PADDLE_ENFORCE_LE( - slot_rank, (bwd_out_meta_.size() - 1), + (slot_rank + 1), bwd_out_meta_.size(), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - if (!(*fwd_in)[i]) { - meta.SetStopGradient(i, true); - continue; - } - if ((*fwd_in)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_in)[i]->StopGradient()); + if (metas.size() == 0) { + metas.resize(1); + } + auto& meta = metas[0]; + if (fwd_in_meta) { + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } else { + meta.SetStopGradient(true); + } + + // Record TensorMeta + if (fwd_in.impl() && fwd_in.impl().get()) { + if (phi::DenseTensor::classof(fwd_in.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in.impl().get()); + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank) { +void GradNodeBase::SetGradOutMeta( + const std::vector& fwd_in, size_t slot_rank) { + size_t slot_size = fwd_in.size(); PADDLE_ENFORCE_LE( - (slot_rank + 1), bwd_out_meta_.size(), + slot_rank, (bwd_out_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(1); - if (fwd_in) { - meta.SetStopGradient(0, fwd_in->StopGradient()); - } else { - meta.SetStopGradient(0, true); + if (metas.size() < slot_size) { + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + const auto& fwd_in_tensor = fwd_in[i]; + auto& meta = metas[i]; + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + if (fwd_in_meta) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } + + // Record TensorMeta + if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { + if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in_tensor.impl().get()); + + PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with " + "phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } } } @@ -207,12 +283,8 @@ void GradNodeBase::SetDefaultGradInOutMeta() { "meta setter, other size of inputs and outputs should " "create with Setter and Getters")); // Default stop_gradient is false and slot id is 0, slot size is 1; - bwd_out_meta_[0].Init(1); - bwd_in_meta_[0].Init(1); -} - -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; + bwd_out_meta_[0].resize(1); + bwd_in_meta_[0].resize(1); } int64_t GradNodeBase::RegisterGradientHook( @@ -222,6 +294,10 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } +const std::vector>& GradNodeBase::GetEdges() const { + return adj_edges_; +} + std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { @@ -270,4 +346,45 @@ GradNodeBase::ApplyGradientHooks( return outs; } +void GradNodeBase::HandleComplexGradToRealGrad( + std::vector>* out_grads) { + for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { + const std::vector& slot_out_grads = + (*out_grads)[slot_id]; + for (size_t rank_id = 0; rank_id < slot_out_grads.size(); rank_id++) { + const GradSlotMeta& slot_meta = bwd_out_meta_[slot_id][rank_id]; + + PADDLE_ENFORCE( + slot_meta.HasTensorMeta() > 0, + paddle::platform::errors::Fatal( + "We require TensorMeta in GradInputMeta() to obtain forward data " + "types." + "However, no TensorMeta is detected in bwd_out_meta_.")); + + auto fwd_data_type = paddle::framework::TransToProtoVarType( + slot_meta.GetTensorMeta().dtype); + const paddle::experimental::Tensor& grad = slot_out_grads[rank_id]; + + if (paddle::framework::IsComplexType(fwd_data_type)) continue; + + // Only Handle Complex To Real for DenseTensor for now + if (phi::DenseTensor::classof(grad.impl().get())) { + phi::DenseTensor* grad_dense_tensor = + static_cast(grad.impl().get()); + + auto curr_data_type = + paddle::framework::TransToProtoVarType(grad_dense_tensor->type()); + if (!paddle::framework::IsComplexType(curr_data_type)) continue; + + // Convert Complex GradOut to Real + auto out = std::make_shared(); + paddle::framework::TransComplexToReal(fwd_data_type, curr_data_type, + *grad_dense_tensor, out.get()); + + (*out_grads)[slot_id][rank_id].set_impl(out); + } + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 16513f05e0777a8e57f54c925d68867dda656612..4b21a193ee021f06538e1a11bbffb898376739a7 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -57,21 +57,28 @@ class AutogradMeta; class GradSlotMeta { public: GradSlotMeta() = default; - void Init(size_t size) { - size_ = static_cast(size); - stop_gradient_.resize(size, false); + bool IsStopGradient() const { return stop_gradient_; } + void SetStopGradient(bool stop_gradient = true) { + stop_gradient_ = stop_gradient; } - bool IsInitialized() const { return size_ != -1; } - bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; } - int Size() const { return size_; } - void SetStopGradient(size_t rank, bool stop_gradient = true) { - stop_gradient_.at(rank) = stop_gradient; + void SetTensorMeta(const phi::DenseTensorMeta& meta) { + meta_ = std::make_shared(meta); + } + bool HasTensorMeta() const { return meta_ && meta_.get(); } + const phi::DenseTensorMeta& GetTensorMeta() const { + if (!HasTensorMeta()) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "meta_ of GradSlotMeta has not been initialized yet." + "You're expected to check Edge availability with HasTensorMeta()" + "before calling GetTensorMeta() interface.")); + } + return *meta_.get(); } private: - int size_{-1}; - std::vector stop_gradient_{false}; + bool stop_gradient_{false}; + std::shared_ptr meta_ = nullptr; }; class GradNodeBase { @@ -95,8 +102,12 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads) = 0; + const std::vector>& grads, + bool create_graph = false) = 0; + + virtual void ClearTensorWrappers() = 0; + virtual bool IsTensorWrappersCleared() = 0; /** * AddEdges is designed to set input tensors' backward Node as current * node's Edges. @@ -108,25 +119,30 @@ class GradNodeBase { void AddEdges(std::vector* metas, size_t slot_id); void AddEdges(AutogradMeta* meta, size_t slot_id); - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; + // adj_edges were moved inside OutputMeta(), so no available direct access + // from GradNodeBase. + // To access Edges, get GradSlotMeta by calling OutputMeta(), then use + // slot_meta.GetEdge() /** * Get Input Meta of current Grad node**/ - const std::vector& InputMeta() const; + const std::vector>& InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector& OutputMeta() const; + const std::vector>& OutputMeta() const; /** * Set bwd ins and outs info with forward vars * **/ - void SetGradInMeta(std::vector* fwd_out, size_t slot_rank); - void SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank); + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); + void SetGradInMeta(const paddle::experimental::Tensor& fwd_out, + size_t slot_rank); - void SetGradOutMeta(std::vector* fwd_in, size_t slot_rank); - void SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank); + void SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank); + void SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, + size_t slot_rank); /** * Default setters for Grad in/out meta this should be used for same special @@ -158,11 +174,21 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + /** + * Handle Complex - Real Type Promotion + * **/ + void HandleComplexGradToRealGrad( + std::vector>* out_grads); + bool NeedComplexToRealConversion() { return need_complex_to_real_; } + virtual std::string name() { return "GradNodeBase"; } - private: - // TODO(jiabin): Use SmallVector instead after merge PR from develop + /** + * GetEdges is designed to get all edges of current node**/ + const std::vector>& GetEdges() const; + private: + // TODO(zhanlve): Merge adj_edges_ into GradOutMeta // Edges recorded the backward related node info, which indicate all edges // linked // by this Grad Node. @@ -170,10 +196,10 @@ class GradNodeBase { std::vector> adj_edges_; // bwd_out_meta_ is used to record Grad output info for backward - std::vector bwd_out_meta_; + std::vector> bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector bwd_in_meta_; + std::vector> bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -184,6 +210,8 @@ class GradNodeBase { /* hook */ std::shared_ptr>> gradient_hooks_; + // We handle complex to real conversion only if any complex GradIn is involved + bool need_complex_to_real_ = false; int64_t next_hook_id_{0}; }; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 69fc7df2f1420382735cf59fbe85f7e2207d0f77..163d25e85ce8c085087331c6e3273075aed5e5f4 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -21,6 +21,11 @@ namespace egr { +void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + buffer_[slot_id][rank] = + paddle::experimental::zeros_like(buffer_[slot_id][rank]); +} + void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one) { diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index d66a81fe8285980bad4159d5414985dc9c744549..8c00f9161b629f7a3f093a1225d3d5b0b9bcca8b 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -26,12 +26,13 @@ namespace egr { * GradTensorHolder should have as same format as forward output **/ class GradTensorHolder { public: - explicit GradTensorHolder(const std::vector& meta) { - VLOG(7) << "Init GradTensorHolder with meta size: " << meta.size(); - buffer_.resize(meta.size()); + explicit GradTensorHolder( + const std::vector>& metas) { + VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); + buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { - VLOG(7) << "Init GradTensorHolder with meta rank: " << meta[i].Size(); - buffer_[i].resize(meta[i].Size()); + VLOG(7) << "Init GradTensorHolder with meta rank: " << metas[i].size(); + buffer_[i].resize(metas[i].size()); } } @@ -56,6 +57,8 @@ class GradTensorHolder { return buffer_; } + void SetBufferSlotRankZeros(size_t slot_id, size_t rank); + private: std::vector> buffer_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 31aaa93c41643f565836c536d7001c01d2a0826d..8da27f3bb8a13a759bd12737746ce6add4b1aaa5 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -36,6 +36,15 @@ class TensorWrapper { explicit TensorWrapper(const paddle::experimental::Tensor& tensor, bool full_reserved = false, bool no_need_buffer = false) { + // set inplace_version_snapshot_ according to tensor's current inplace + // version. + if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + inplace_version_snapshot_ = inplace_version_counter.CurrentVersion(); + } + /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. And for fwd output tensor, we should not reserve its autogradmeta, @@ -49,6 +58,7 @@ class TensorWrapper { } // shallow copy tensor_impl here + no_need_buffer_ = no_need_buffer; if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { // Only Copy Meta @@ -86,6 +96,7 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor if (full_reserved_) { + check_inplace_version(); return intermidiate_tensor_; } else { std::shared_ptr new_grad_node = grad_node; @@ -94,13 +105,52 @@ class TensorWrapper { intermidiate_tensor_.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); + check_inplace_version(); return intermidiate_tensor_; } } + void check_inplace_version() { + if (no_need_buffer_) { + VLOG(6) << "There's no need to check inplace_version because " + "no_need_buffer_ is true."; + return; + } + if (intermidiate_tensor_.impl() && + phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(intermidiate_tensor_.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + + uint32_t current_inplace_version = + inplace_version_counter.CurrentVersion(); + PADDLE_ENFORCE_EQ( + current_inplace_version, inplace_version_snapshot_, + paddle::platform::errors::PermissionDenied( + "Tensor '%s' used in gradient computation has been " + "modified by an inplace operation. " + "Its version is %d but the expected version is %d. " + "Please fix your code to void calling an inplace operator " + "after using the Tensor which will used in gradient " + "computation.", + intermidiate_tensor_.name(), current_inplace_version, + inplace_version_snapshot_)); + VLOG(6) << " The inplace_version_snapshot_ of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << inplace_version_snapshot_ << " ]"; + VLOG(6) << " The current_inplace_version of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << current_inplace_version << " ]"; + } + } + + void clear() { intermidiate_tensor_.reset(); } + private: bool full_reserved_ = false; + bool no_need_buffer_ = false; std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 1683f4ed5fbe5e4b014e9b369e0231d149c187f1..c8b2d22dcf95139db47704be86a6f64554f7c0ba 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -17,6 +17,14 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT); +#endif namespace eager_test { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) { CHECK_EQ(dt3_tmp_ptr[1], 10.0f); t4.reset(); CHECK(t4.defined() == false); + + VLOG(6) << "Check Tensor Copy_"; + std::vector rows = {1, 2}; + std::vector dims = {2}; + paddle::experimental::Tensor t7(std::make_shared(rows, 2)); + std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->Resize(phi::make_ddim(dims)); + auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->mutable_data(paddle::platform::CPUPlace()); + dt7_tmp_ptr[0] = 6.0f; + dt7_tmp_ptr[1] = 11.0f; + + paddle::experimental::Tensor t8; + paddle::experimental::Tensor t5; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::experimental::Tensor t6; + paddle::experimental::Tensor t9; + VLOG(6) << "Check Tensor Copy_ Selected Rows"; + t8.copy_(t7, paddle::platform::CUDAPlace(0), true); + t9.copy_(t8, paddle::platform::CPUPlace(), true); + auto* dt9_tmp_ptr = std::dynamic_pointer_cast(t9.impl()) + ->value() + .data(); + CHECK_EQ(dt9_tmp_ptr[0], 6.0f); + CHECK_EQ(dt9_tmp_ptr[1], 11.0f); + CHECK_EQ(std::dynamic_pointer_cast(t9.impl())->height(), + 2); + + VLOG(6) << "Check Tensor Copy_ Dense Tensor"; + t5.copy_(t3, paddle::platform::CUDAPlace(0), true); + t6.copy_(t5, paddle::platform::CPUPlace(), true); + auto* dt6_tmp_ptr = + std::dynamic_pointer_cast(t6.impl())->data(); + CHECK_EQ(dt6_tmp_ptr[0], 5.0f); + CHECK_EQ(dt6_tmp_ptr[1], 10.0f); +#else + t5.copy_(t3, paddle::platform::CPUPlace(), true); + auto* dt5_tmp_ptr = + std::dynamic_pointer_cast(t5.impl())->data(); + CHECK_EQ(dt5_tmp_ptr[0], 5.0f); + CHECK_EQ(dt5_tmp_ptr[1], 10.0f); +#endif + VLOG(6) << "Finish"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index e3db309c4016a512c5379fb352beb4af690a271e..d592b5ccf66ffc8532214a72612e9308b7e51fe5 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "glog/logging.h" #include "gtest/gtest.h" @@ -23,14 +24,9 @@ TEST(GradNodeInfo, GradSlotMeta) { auto grad_slot = egr::GradSlotMeta(); - CHECK(grad_slot.IsInitialized() == false); - VLOG(6) << "Init GradSlotMeta"; - grad_slot.Init(2); - CHECK(grad_slot.IsInitialized() == true); VLOG(6) << "Set SetStopGradient"; - grad_slot.SetStopGradient(0); - CHECK(grad_slot.IsStopGradient(0) == true); - CHECK_EQ(grad_slot.Size(), 2); + grad_slot.SetStopGradient(); + CHECK(grad_slot.IsStopGradient() == true); } void TestGradNodeBase(bool is_remove_gradient_hook) { @@ -56,18 +52,22 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { ->data()[0], 6.0f); VLOG(6) << "Test Add Edges"; - egr::Edge edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(edge0); + egr::Edge tmp_edge0(grad_test_node1, 1, 2); + auto auto_grad0 = std::make_shared(tmp_edge0); auto_grad0->SetStopGradient(false); - egr::Edge edge1(grad_test_node1, 3, 4); - auto auto_grad1 = std::make_shared(edge1); + + egr::Edge tmp_edge1(grad_test_node1, 3, 4); + auto auto_grad1 = std::make_shared(tmp_edge1); + et1.set_autograd_meta(auto_grad1); auto_grad1->SetStopGradient(false); grad_test_node0->AddEdges(auto_grad0.get(), 0); + CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, size_t(2)); std::vector metas = {auto_grad1.get()}; + grad_test_node0->AddEdges(&metas, 1); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, size_t(3)); @@ -76,22 +76,30 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); - grad_test_node0->SetGradInMeta(&metas, 0); - grad_test_node0->SetGradInMeta(auto_grad1.get(), 1); - grad_test_node0->SetGradOutMeta(&metas, 0); - grad_test_node0->SetGradOutMeta(auto_grad1.get(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1); - CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0)); - CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0)); + grad_test_node0->SetGradInMeta(et1, 0); + grad_test_node0->SetGradInMeta({et1}, 1); + grad_test_node0->SetGradOutMeta(et1, 0); + grad_test_node0->SetGradOutMeta({et1}, 1); + CHECK_EQ(grad_test_node0->InputMeta()[0].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[1].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->InputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); + CHECK(grad_test_node0->OutputMeta()[0][0].IsStopGradient()); + CHECK(grad_test_node0->OutputMeta()[1][0].IsStopGradient()); + CHECK_EQ(grad_test_node0->OutputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->OutputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); VLOG(6) << "Test Default Set Meta and Get Meta"; auto grad_test_node2 = std::make_shared( /* val */ 5.0, /* in_num */ 1, /* out_num */ 1); grad_test_node2->SetDefaultGradInOutMeta(); - CHECK(grad_test_node2->OutputMeta()[0].IsInitialized()); - CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false); - CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1); + CHECK_GT(grad_test_node2->OutputMeta()[0].size(), size_t(0)); + CHECK(grad_test_node2->OutputMeta()[0][0].IsStopGradient() == false); + CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1)); VLOG(6) << "Test Gradient Hook"; auto gradient_hook = []( @@ -135,7 +143,17 @@ TEST(GradNodeInfo, GradNodeBase) { } TEST(GradNodeInfo, Edge) { + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor et1(dt); + auto grad_test_node0 = std::make_shared(5, 2, 2); + auto auto_grad1 = std::make_shared(); VLOG(6) << "Test Construct Edge"; egr::Edge edge0 = egr::Edge(); CHECK(edge0.IsInitialized() == false); @@ -145,13 +163,12 @@ TEST(GradNodeInfo, Edge) { egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0))); VLOG(6) << "Test Set Edge's Grad Node"; auto* grad_node = edge1.GetGradNode(); + et1.set_autograd_meta(auto_grad1); + grad_node->SetGradInMeta(et1, 0); + CHECK_EQ(grad_node->InputMeta().size(), size_t(2)); - auto mt_grad_node = edge1.GetMutableGradNode(); - auto auto_grad1 = std::make_shared(); std::vector metas = {auto_grad1.get()}; - // Uninitialized AutogradMeta indicates - mt_grad_node->SetGradInMeta(&metas, 0); - CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true); + CHECK(grad_node->InputMeta()[0][0].IsStopGradient() == true); VLOG(6) << "Test Get/Set Edge Rank Info"; CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0)); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 535c93ac53b1751d9634476e47f32dc0cbe22708..0b167203735d65683b0f978fa34fe7f457aae4f2 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads) - override { + const std::vector>& grads, + bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase { std::vector> res = {{et1}}; return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } float val_; }; } // namespace eager_test diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 384fdcd6f97c4b318341db68cdd88b644d42d22a..645eac06ddda519bba952abb460571c9667c6d4a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -30,8 +30,7 @@ PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); using namespace egr; // NOLINT TEST(GradTensorHolder, Constructor) { - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta}); GradTensorHolder grad_tensor_holder2 = GradTensorHolder(grad_tensor_holder); @@ -72,8 +71,7 @@ TEST(GradTensorHolder, Interfaces) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); @@ -138,8 +136,7 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { paddle::experimental::Tensor t2(sr2); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 769bd7f687f4584d44bbfa30b73611a3128289bf..c8fb6050e9d450d598ea722ac74da924e8857f0e 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -37,7 +37,7 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -static size_t max_num_benchmark_runs = 5000; +static size_t max_num_benchmark_runs = 4000; namespace egr { @@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } std::vector target_tensors = {input_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) @@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp( reduce_sum_dygraph_function(input0, {{"reduce_all", true}}); std::vector target_tensors = {Out}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { std::unordered_map result = diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index c65ad4641cf2206cc0f97d91f1fb24e50b7b63cd..52dba6b9218c7be8a29ae1aff619facd25a6f3b6 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 0c894ed267fcdd08d44d4df08bfaf0554874aebf..87f8f6eca1f88fe9a54583ee19586dd75c7e231e 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -33,6 +33,7 @@ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); namespace egr { @@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) { } std::vector outs = {target_tensor}; // Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); @@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { } // Run Backward - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) { } // Use Empty Grad Tensor - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) { node2_ptr->AddEdges(&res2, 0); } - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 36594f1aac8cdb131bb77f1396dca19a0c2e8cc0..8b0759c17ed3712079e8954df60e35afaaf02a9e 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { std::vector res = {meta}; scale_node_ptr->AddEdges(&res, 0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a..882695e98d109e09340223e21322a02d1b48c6ea 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); VLOG(7) << "Target Grad is: " << std::static_pointer_cast( @@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) { std::vector outs = {out1}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 10.0); @@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); @@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad // leaf grad @@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 60.0); @@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 2.0); @@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { // TODO(jiabin): fix this with add functor // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 2a5ad53204a6201149bec0b3dac0fa3baf441f2e..49e517dc9b3f3271ef26dfbece46f799ef805c57 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) { std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue(tensor, 0.25); @@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) { eager_test::CompareTensorWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); @@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) { eager_test::CompareTensorWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue(Y, 1.0); @@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b03799c48659c579938df6efc0f7cf57bbc0bec --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +namespace egr { + +TEST(Grad, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor (output) + paddle::experimental::Tensor output_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + // Create input tensor + const paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Output_tensor set GradNode、OutRank、StopGradient propertis + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + // Get autograd_meta from input tensor + AutogradMeta* auto_grad_meta1 = + EagerUtils::unsafe_autograd_meta(leaf_tensor); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + // input tensor set GradNode、OutRank、StopGradient propertis + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // grad_node Add Edges + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + std::vector outs = {output_tensor}; + + // Run Grad + auto result = Grad(outs, {leaf_tensor}, {}); + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 5.0); +} + +TEST(Grad, SingleNodeCustomGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + paddle::experimental::Tensor grad_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* +Node1 + | +Node0 + | + { } // empty grad tensor +*/ +TEST(Grad, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Target Tensor + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + // Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + auto result = Grad(target_tensors, {leaf_tensor}, {}); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + in0 in1 +*/ +TEST(Grad, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + paddle::experimental::Tensor grad_tensor0 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor grad_tensor1 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + paddle::experimental::Tensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta0->SetStopGradient(false); + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); + + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + eager_test::CompareTensorWithValue(result[0], 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index d546df4ed087a99a28096a5336fab3826991534a..2c53fc89f650e36f1435c7e1e805453fe7822cf2 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 4.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); @@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { leaf_tensor, std::make_shared(hook_function)); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 56813c498d2410caa452da7a334c393b230c65bf..b86865e2d126fbfc0b00495a6e3208932ac6de39 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { } VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue( @@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue( @@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue( @@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) { } } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 9967d8c36900f45fdd76272bc4416df1d30f2a6a..277319bc700b652855576db248463b424846e2e9 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -66,10 +66,10 @@ inline void run_program_dygraph_function( grad_node->SetStepScope(step_scope); // Set Grad out rank as same as fwd input and set stop gradient to bwd - grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); - grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + grad_node->SetGradOutMeta(x, /*slot id*/ 0); + grad_node->SetGradOutMeta(params, /*slot id*/ 1); - grad_node->SetGradInMeta(&p_autograd_outs, 0); + grad_node->SetGradInMeta(deref_out, 0); // Set Next Edges grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index d99624e49324853d513a20a725c1a3d12b6aaab5..4eaa64d3ac659ca0ec76083b70855d8b6b241556 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector> operator()( - const std::vector> &grads) - override { + const std::vector> &grads, + bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( grads.size(), 1, @@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + // SetAttrMap void SetAttrMap(const paddle::framework::AttributeMap &attrs) { attrs_ = attrs; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 8a57d2694535e9c27e88416468fe5a67ce020b43..048087903a47c1699a7d7f32199c313146bd37ab 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -212,6 +212,27 @@ std::vector> EagerUtils::CreateVars( return res; } +void EagerUtils::ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor) { + // Only modify the meta information of the inplace tensor, because + // EagerVariable cannot modify Tensor's meta information after inplace + // op (such as ``reshape``) is executed. + PADDLE_ENFORCE_NOT_NULL(inplace_tensor, + paddle::platform::errors::Fatal( + "Inplace Tensor is null and cannot be modified. " + "We are tring to Modify Inplace Input from its " + "shared_ptr, this error may indicate the inplace " + " input is nullptr")); + if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) { + phi::DenseTensor* variable_dense_tensor = + static_cast(inplace_variable->GetTensorBase().get()); + phi::DenseTensor* tensor_dense_tensor = + static_cast(inplace_tensor->impl().get()); + tensor_dense_tensor->set_meta(variable_dense_tensor->meta()); + } +} + std::vector EagerUtils::GetOutputs( const std::vector>& outs) { std::vector res; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index fa5735e6f32a0ca7762b9ba94cce26ac8ac567dd..fbd080ef70e25408abcb979360610ad08d752f96 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -144,6 +145,19 @@ class EagerUtils { iter.apply(std::forward(args)...); } + static void CheckInplace(const paddle::experimental::Tensor& target, + const AutogradMeta* autograd_meta, + bool require_any_grad) { + if (require_any_grad && autograd_meta) { + PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && + egr::egr_utils_api::IsLeafTensor(target), + false, paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); + } + } + // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper( TensorWrapper* tw, const std::shared_ptr& grad_node); @@ -171,6 +185,9 @@ class EagerUtils { static std::vector> CreateVars( const size_t num); // Construct Tensor From var + static void ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor); static std::vector GetOutputs( const std::vector>& outs); static paddle::experimental::Tensor GetOutput( diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 1a4f283f511da4300d26e764907998ad647eeebf..589d09bf81c1d95795cd80ed22581e52156ae417 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, return; } + // NOTE(hqp): Special case for CPU->MLU, avoid stream sync. + if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) { + paddle::framework::TensorCopy( + in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place), + out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 48850d4624a14c32d30e4562b322115127823c6b..f951b5d0f507039decfc3b4d0081f17cc9f8f50e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -174,10 +174,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ctx->ops_, place_); #endif - auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); } diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 17346f5fd939324e6c2d709fb09be2cb65669429..2b8b4b3ff9573f601f8da3092c18433a49a93869 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,8 +10,9 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h new file mode 100644 index 0000000000000000000000000000000000000000..235f7a226ad17649960d1e72d7907e8013e406fe --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -0,0 +1,120 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... +by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index a6508bf96c00f835da4aee79503f16fa5451e794..b8f9f0bfec9b2a0bf6b6fb1e122e40b3eaa90fa8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -14,114 +14,25 @@ #pragma once #include "heter_comm.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct GpuPsGraphNode { - int64_t node_id; - int neighbor_size, neighbor_offset; - // this node's neighbor is stored on [neighbor_offset,neighbor_offset + - // neighbor_size) of int64_t *neighbor_list; -}; - -struct GpuPsCommGraph { - int64_t *neighbor_list; - GpuPsGraphNode *node_list; - int neighbor_size, node_size; - // the size of neighbor array and graph_node_list array - GpuPsCommGraph() - : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) - : neighbor_list(neighbor_list_), - node_list(node_list_), - neighbor_size(neighbor_size_), - node_size(node_size_) {} -}; - -/* -suppose we have a graph like this -0----3-----5----7 - \ |\ |\ - 17 8 9 1 2 - -we save the nodes in arbitrary order, -in this example,the order is -[0,5,1,2,7,3,8,9,17] -let us name this array u_id; -we record each node's neighbors: -0:3,17 -5:3,7 -1:7 -2:7 -7:1,2,5 -3:0,5,8,9 -8:3 -9:3 -17:0 - -by concatenating each node's neighbor_list in the order we save the node id. -we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] -this is the neighbor_list of GpuPsCommGraph -given this neighbor_list and the order to save node id, -we know, -node 0's neighbors are in the range [0,1] of neighbor_list -node 5's neighbors are in the range [2,3] of neighbor_list -node 1's neighbors are in the range [4,4] of neighbor_list -node 2:[5,5] -node 7:[6,6] -node 3:[9,12] -node 8:[13,13] -node 9:[14,14] -node 17:[15,15] -... -by the above information, -we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph -of size 9, -where node_list[i].id = u_id[i] -then we have: -node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 -node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 -node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 -node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 -node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 -node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 -node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 -node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 -node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 -*/ -struct NeighborSampleResult { - int64_t *val; - int *actual_sample_size, sample_size, key_size; - NeighborSampleResult(int _sample_size, int _key_size) - : sample_size(_sample_size), key_size(_key_size) { - actual_sample_size = NULL; - val = NULL; - }; - ~NeighborSampleResult() { - if (val != NULL) cudaFree(val); - if (actual_sample_size != NULL) cudaFree(actual_sample_size); - } -}; - -struct NodeQueryResult { - int64_t *val; - int actual_sample_size; - NodeQueryResult() { - val = NULL; - actual_sample_size = 0; - }; - ~NodeQueryResult() { - if (val != NULL) cudaFree(val); - } -}; class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource) : HeterComm(1, resource) { load_factor_ = 0.25; + rw_lock.reset(new pthread_rwlock_t()); + cpu_table_status = -1; + } + ~GpuPsGraphTable() { + if (cpu_table_status != -1) { + end_graph_sampling(); + } } void build_graph_from_cpu(std::vector &cpu_node_list); NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); @@ -134,9 +45,19 @@ class GpuPsGraphTable : public HeterComm { int *h_right, int64_t *src_sample_res, int *actual_sample_size); + int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int load(const std::string &path, const std::string ¶m); + virtual int32_t end_graph_sampling() { + return cpu_graph_table->end_graph_sampling(); + } private: std::vector gpu_graph_list; + std::shared_ptr cpu_graph_table; + std::shared_ptr rw_lock; + mutable std::mutex mutex_; + std::condition_variable cv_; + int cpu_table_status; }; } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 839c7e5468c6c6938c6b4cda3dd879c7366e7d6e..16a6857ae96eecaaa06b92b9912387f22612f53e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -14,6 +14,7 @@ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { /* @@ -45,6 +46,33 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, } } +int GpuPsGraphTable::init_cpu_table( + const paddle::distributed::GraphParameter& graph) { + cpu_graph_table.reset(new paddle::distributed::GraphTable); + cpu_table_status = cpu_graph_table->initialize(graph); + if (cpu_table_status != 0) return cpu_table_status; + std::function&)> callback = + [this](std::vector& res) { + pthread_rwlock_wrlock(this->rw_lock.get()); + this->clear_graph_info(); + this->build_graph_from_cpu(res); + pthread_rwlock_unlock(this->rw_lock.get()); + cv_.notify_one(); + }; + cpu_graph_table->set_graph_sample_callback(callback); + return cpu_table_status; +} + +int GpuPsGraphTable::load(const std::string& path, const std::string& param) { + int status = cpu_graph_table->load(path, param); + if (status != 0) { + return status; + } + std::unique_lock lock(mutex_); + cpu_graph_table->start_graph_sampling(); + cv_.wait(lock); + return 0; +} /* comment 1 @@ -68,6 +96,7 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, that's what fill_dvals does. */ + void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, int64_t* src_sample_res, int* actual_sample_size) { @@ -258,7 +287,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); + auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); int* d_shard_actual_sample_size_ptr = diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2cf702969f99a02cd2b89d69c94f42b265d46135..f85ed330dc8ea4eb4199b6ab006ac54be1b30b0d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c7ea10b26565a4181230f6150272babd315105f --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +char edge_file_name[] = "edges.txt"; +TEST(TEST_FLEET, graph_sample) { + std::vector edges; + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + // std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + while (neighbor_size--) { + edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) + + "\t1.0"); + node_id++; + } + ind++; + } + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(3); + table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto.set_gpups_graph_sample_args("5,5,1,1"); + prepare_file(edge_file_name, edges); + g.init_cpu_table(table_proto); + g.load(std::string(edge_file_name), std::string("e>")); + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + int64_t *res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + std::sort(res, res + 3); + std::sort(res + 6, res + 9); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 31a30f72e3aa6120efbaa158b1d1786dda155145..432e57107e84d9399c76f21343fe9ef5dd879473 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -148,7 +148,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; @@ -182,7 +182,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); @@ -300,7 +300,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); bool flag = true; @@ -378,8 +378,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -431,7 +431,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); @@ -603,7 +603,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() << " seconds."; } @@ -746,7 +746,7 @@ void PSGPUWrapper::BeginPass() { "[BeginPass] after build_task, current task is not null.")); } - VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::EndPass() { @@ -769,7 +769,7 @@ void PSGPUWrapper::EndPass() { current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index b1d7059f311cd370a40e83d7b0016d5af8cdb163..2babecc6ddf933e19b9d704ee7515f56f7431839 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsDenseTensorVectorInput(const std::string& name) const override { + auto var_types = ctx_.GetInputsVarType(name); + return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY; + } + bool IsDenseTensorOutput(const std::string& name) const override { auto var_types = ctx_.GetOutputsVarType(name); return var_types[0] == proto::VarType::LOD_TENSOR; @@ -125,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dims(); } else if (var->IsType()) { return var->Get().dims(); + } else if (var->IsType()) { + // use tensor array size as dims + auto& tensor_array = var->Get(); + return phi::make_ddim({static_cast(tensor_array.size())}); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Currently, only can get dims from DenseTensor or SelectedRows.")); + "Currently, only can get dims from DenseTensor or SelectedRows or " + "DenseTensorArray.")); } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); @@ -144,6 +154,10 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dtype(); } else if (var->IsType()) { return var->Get().dtype(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get dtype from LoDTensorArray now + return phi::DataType::UNDEFINED; } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can get dtype from DenseTensor or SelectedRows.")); @@ -157,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor { DataLayout layout() const override { if (is_runtime_) { auto* var = BOOST_GET_CONST(Variable*, var_); - return var->Get().layout(); + if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get layout from LoDTensorArray now + return phi::DataLayout::UNDEFINED; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can get layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported get layout for VarDesc now @@ -174,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; + } else if (var->IsType()) { + auto* tensor_array = var->GetMutable(); + // Note: Here I want enforce `tensor_array->size() == 0UL`, because + // inplace using on LoDTensorArray is dangerous, but the unittest + // `test_list` contains this behavior + PADDLE_ENFORCE_EQ(dims.size(), 1UL, + platform::errors::InvalidArgument( + "LoDTensorArray can only have one dimension.")); + // only set the array size for LoDTensorArray input + tensor_array->resize(dims[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dims from DenseTensor or SelectedRows.")); @@ -193,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dtype from DenseTensor or SelectedRows.")); @@ -206,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor { void set_layout(DataLayout layout) override { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); - LoDTensor* tensor = var->GetMutable(); - phi::DenseTensorUtils::GetMutableMeta( - static_cast(tensor)) - ->layout = layout; + if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + auto* tensor = var->GetMutable()->mutable_value(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can set layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported set layout for VarDesc now @@ -251,9 +300,7 @@ class CompatMetaTensor : public phi::MetaTensor { void share_meta(const MetaTensor& meta_tensor) override { share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - + set_layout(meta_tensor.layout()); // special case: share lod of LoDTensor share_lod(meta_tensor); } @@ -442,6 +489,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name, infershape_input.size())); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct InferMetaContext.", + attr_names[i])); + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. auto& attr = attr_reader.GetAttr(attr_name); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 623c8a048c2417ab51772c55b681031d9bcfd925..7aaaef712a6e9186058b579d1c69b0cfb201d899 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) +pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 036fde8fac6d911f8a97dbc097fae7f9fdd2ab6f..f5f6f3ecb855cfa9acb6c2169f1fc43458578a2a 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -95,6 +95,7 @@ std::map> Graph::InitFromBlock( std::unordered_map> name_to_desc_block_id; + block_id_ = block.ID(); const BlockDesc *block_var_visible = █ while (block_var_visible != nullptr) { for (auto *var : block_var_visible->AllVars()) { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 21e743e3587d80536b7bd4805298f22a99482217..10645f08dc3ba833c3a4ca75a1ac623ee2c1e8e9 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -230,6 +230,7 @@ class Graph { auto *x = AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -245,6 +246,7 @@ class Graph { "The OpDesc used to create operator node is null.")); auto *x = AddNode(new ir::Node(op_desc)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -263,6 +265,7 @@ class Graph { num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -276,6 +279,7 @@ class Graph { } auto *x = AddNode(new ir::Node(name, type, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 18068e22b7f3c31d59636bc7ab6a234e109d5ee6..164a13d1560f4d0008c2bdb5a56d8ad6f875157b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2052,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); - - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); + + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); return out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 062d2f9dedce65f6e16b70f0b201a4ca63b0531a..17c70ace301d39db6fcf14d01c11baab0dc7d403 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1016,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. -struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} - - PDNode* operator()(PDNode* x_var, PDNode* y_var); - - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_mul_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} + + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); + + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 1b2a62695fb135925d43a3341aaacdf956da8da3..9fc6de3c8c1725707edd9f3b9f8de87706c16cc9 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch( varinfo_maps.at(cinn_launch_op->GetScopeIdx()); // collect all MemOptVarInfos of external variables - // that would be eager deleted after the cinn_launch subgraph executed, - // and store them as attribute of the subgraph + // that were eager deleted after the cinn_launch subgraph executed, + // and we will delete them in advance among eager_deletion_ops + // inside cinn_launch subgraph, so store them as attribute of the subgraph + // to pass to the inner eager_deletion_ops. for (const auto& var_name : vars_to_delete) { auto it = src_varinfo_map.find(var_name); PADDLE_ENFORCE_NE(it, src_varinfo_map.end(), @@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch( "MemOptVarInfo of var[%s] not found", var_name)); dst_varinfo_map.emplace(var_name, it->second); } + // skip running of the followed eager_deletion_op + followed_eager_deletion_op->SetSkipRunning(true); } static void TakeVarInfoFromMainGraph( diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..4aa59d9196b1b4d73fffa8f1b2a9bba08d6091be --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MixedPrecisionConfigurePass::InsertCastOps( + Graph* graph, const StringSet& blacklist) const { + VLOG(3) << "Insert the cast op before and after the kernel that does not " + "supports fp16 precision"; + + auto update_cast_desc = [&]( + framework::OpDesc& desc, const std::string& x_name, + const std::string& out_name, const int in_dtype, const int out_dtype) { + desc.SetType("cast"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("in_dtype", in_dtype); + desc.SetAttr("out_dtype", out_dtype); + desc.SetAttr("use_mkldnn", false); + desc.SetAttr("with_quant_attr", false); + desc.Flush(); + }; + + auto cast_input = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto inlinks = op_node->inputs; + for (auto* pre_node : inlinks) { + if (pre_node->IsVar()) { + const auto is_persistable = pre_node->Var()->Persistable(); + const auto is_float = + pre_node->Var()->GetDataType() == proto::VarType::FP16 || + pre_node->Var()->GetDataType() == proto::VarType::FP32 || + pre_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* pre_node_input : pre_node->inputs) { + if (!pre_node_input->IsOp()) continue; + const auto& type = pre_node_input->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = pre_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 4, 5); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + op_node->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(pre_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, op_node); + } + } + } + } + } + }; + + auto cast_output = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto outlinks = op_node->outputs; + for (auto* next_node : outlinks) { + if (next_node->IsVar()) { + const auto is_persistable = next_node->Var()->Persistable(); + const auto is_float = + next_node->Var()->GetDataType() == proto::VarType::FP16 || + next_node->Var()->GetDataType() == proto::VarType::FP32 || + next_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* next_node_output : next_node->outputs) { + if (!next_node_output->IsOp()) continue; + + const auto& type = next_node_output->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = next_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 5, 4); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + next_node_output->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(next_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, next_node_output); + } + } + } + } + } + }; + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + const auto& type = op_node->Op()->Type(); + if (blacklist.count(type)) { + cast_input(graph, op_node, blacklist); + cast_output(graph, op_node, blacklist); + } + } +} + +void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const { + const auto blacklist = + Get>("gpu_fp16_disabled_op_types"); + InsertCastOps(graph, blacklist); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mixed_precision_configure_pass, + paddle::framework::ir::MixedPrecisionConfigurePass); diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fc5a612ecb833d2a5117a2dab58747d21226df8d --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringSet = std::unordered_set; + +class MixedPrecisionConfigurePass : public FusePassBase { + public: + MixedPrecisionConfigurePass() = default; + virtual ~MixedPrecisionConfigurePass() {} + + protected: + void ApplyImpl(Graph* graph) const override; + + private: + void InsertCastOps(Graph* graph, const StringSet& blacklist) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 2403e60df3918394e99c3284b2a417e336fc3bae..fc2758c27345032c1ad0831b4ee0016fa84b3f5c 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -118,7 +118,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -145,10 +145,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - conv_output, - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_x_count = 0; @@ -160,16 +160,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -179,14 +179,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_x_count++; }; @@ -212,10 +212,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_y_count = 0; @@ -227,16 +227,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_x, conv_output)) return; + if (!IsReachable(g, elementwise_x, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -246,14 +246,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_x, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_y_count++; }; @@ -282,8 +282,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); @@ -301,10 +301,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!IsCompat(subgraph, g)) { LOG(WARNING) @@ -312,8 +312,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( return; } - if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; - if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; Node* projection_node; Node* residual_conv_op; @@ -333,14 +333,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( if (HasFusedActivation(residual_conv_op)) return; residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); found_projection_conv_count++; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343d638f005aa8e0700680b6ac00d6ec..f4358fb243f20bc9b024ef6b02768773fa995f45 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); + VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + ++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 412c4e40a01d50b73f72076f3a0424081d633247..3a286264e41ffe1c329ba3971d777ce4fbc05b5e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c8641060b8ad89219749d8400558c6a..22000865948d629a5933ad0319e41dab71433fff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86aad10880f3a67d8250026a6e9ac18..3b883dac9782af8350b3e22d2954e21789a1a120 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 11190309814e7c75777a6cddd7e4d24bfc7ba9e6..bf2cf58f970addf1dac9f4871ba4abe09c3c7b38 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -32,8 +32,9 @@ USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index ef2e83ced26e07f199a122ee3157eb428b63aec9..7df957b2c0eca64bacd1b48065f37ddffec1770a 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -18,6 +18,7 @@ #include #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,10 +28,11 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 7e61d6ae4248b3f41fd950fcf80e0306bd0971bb..8c51c278d4872bd5b0b019223fb0e778df390732 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -125,6 +125,7 @@ class Node { // Only use this for auto parallel. // A node does not have original desc if the return is zero. uint64_t OriginalDescId() const { return original_desc_id_; } + int GraphId() const { return graph_id_; } bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -246,10 +247,12 @@ class Node { // Store the original id of var desc or op desc. // Only use this for auto parallel. uint64_t original_desc_id_{0}; + int graph_id_{-1}; private: // ID can only set by a Graph. void SetId(int id) { id_ = id; } + void SetGraphId(int graph_id) { graph_id_ = graph_id; } // desc_order can only set by a Graph when constructing a Graph from a // BlockDesc. diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ece48158586404cfe0c956fd66a20dc0db1ce96e..f30d1ea1b83dde65bdb703d511d0246fe4886113 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -41,6 +41,7 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ops_, place_); #endif platform::ScopedFlushDenormal flush; for (auto &op : ops_) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index eadb00b9e88e14075c46a53c711fd43774f26581..7fe1852f7396cb8cebe4b83f4cc80a8023421351 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -31,14 +31,14 @@ USE_OP(slice); USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum_grad); -USE_OP(reduce_mean_grad); +USE_OP_ITSELF(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); USE_OP_ITSELF(elementwise_add_grad); @@ -47,7 +47,7 @@ USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); -USE_OP(sigmoid_grad); +USE_OP_ITSELF(sigmoid_grad); USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f23a266ef03641bc8f8d273b15ab4982e377cb03..42fbeb5d29ce4ac3a1498704b1fff88570c9c092 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -628,10 +628,12 @@ std::vector ExecutionContext::MultiOutput( bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first + bool has_phi_kernel = false; auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { + has_phi_kernel = true; if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } @@ -639,12 +641,19 @@ bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); - if (it == all_kernels.end()) { - // All control operator must support GPU - return true; - } - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_)) { + if (it != all_kernels.end()) { + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + } else { + if (has_phi_kernel) { + // if has phi kernel, but not find phi gpu kernel and fluid gpu kernel, + // this op doesn't support GPU + return false; + } else { + // All control operator must support GPU return true; } } @@ -1456,7 +1465,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif -#ifdef PADDLE_WITH_XPU + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || @@ -1470,17 +1480,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(type_); - if (platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.library_type_ = LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << type_ - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } + bool is_xpu_unsupport = + (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(type_)); + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif @@ -2083,16 +2112,25 @@ void OperatorWithKernel::BuildPhiKernelContext( auto* var = ins_vector[offset]; if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var->Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } + // Note: here cannot deal with vector input pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done inputs"; @@ -2120,22 +2158,33 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + // Note: If the input LoDTensorArray size is 0, the output + // LoDTensorArray is also 0 + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done outputs"; @@ -2307,6 +2356,10 @@ void OperatorWithKernel::BuildPhiKernelContext( const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1a1171f1dba4d794796ef1421fe386f60a0e587d..6f68c261d2b24dd66a70734d29d448e8927631e9 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.InputVar(name)->IsType(); } + bool IsDenseTensorVectorInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType(); + } + bool IsDenseTensorOutput(const std::string& name) const override { return ctx_.OutputVar(name)->IsType(); } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 10ceae62dccbbab9329b73e0f581b51508511194..5de861235461ff6670503f6372961bdcf0be5ec2 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, proto::VarType::TensorDesc desc; { // int32_t size // proto buffer - int32_t size; + int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); + PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( + "Cannot read tensor desc size")); + PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( + "Tensor desc size should >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 149202468be6c6bec833f100adfd4100c520f8f3..7d60b7d26f3fbceaca9b19995ff2c5d29ad426b8 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -124,7 +124,7 @@ AmpOperators::AmpOperators() OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(), unsupported_ops_gpu_bf16.end()); -// NOTE: GPU/NPU/XPU is compiled seperatly. +// NOTE: GPU/NPU/XPU/MLU is compiled seperatly. #elif defined(PADDLE_WITH_ASCEND_CL) auto unsupported_ops_npu_fp16 = std::get<2>( OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16)); @@ -143,6 +143,15 @@ AmpOperators::AmpOperators() OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(), unsupported_ops_xpu_bf16.end()); +#elif defined(PADDLE_WITH_MLU) + auto unsupported_ops_mlu_fp16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16)); + unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(), + unsupported_ops_mlu_fp16.end()); + auto unsupported_ops_mlu_bf16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16)); + unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(), + unsupported_ops_mlu_bf16.end()); #endif VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " " << unsupported_fp16_ops_->size() << " " @@ -209,7 +218,10 @@ inline bool NeedCast(const std::shared_ptr& var) { auto data_type = GetDataType(var); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { + paddle::platform::is_xpu_place(place) || + paddle::platform::is_mlu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || data_type == paddle::framework::proto::VarType::FP16 || diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 12aa13bbacc3bae5d690323f45817f95762c376c..499cf4d8ad6d82dd554fa4f5bbcf39833fed0eab 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return TensorAddImpl( src_tensor, dst_tensor, place); #else diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index bae49fb381a475dd8227d1dc855a6db28c9cd273..a427b9b8199116098d149689961cedf14e86e5e1 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_XPU +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() @@ -243,29 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } - #endif #ifdef PADDLE_WITH_XPU_KP - expected_kernel_key.place_ = platform::XPUPlace(); - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(op.Type()); - if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; - } - if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; - } - if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.place_ = platform::XPUPlace(); - expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8deb3b93e9c50489dcfc6805063f23e3705cb634..9daac181d57de63a85116d176a286a9be9b3d4c7 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -289,14 +289,23 @@ void BuildDygraphPhiKernelContext( auto& var = ins_vector[offset]->Var(); if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var.template Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var.Type()))); } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -326,22 +335,32 @@ void BuildDygraphPhiKernelContext( if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } - - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { - VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -522,6 +541,10 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index c55599cc9aa954e2bd437f0917c792e4fdb6b577..d18c8e96c49b6a993fbd0a8d632212ae8d7f8c6d 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -390,8 +390,8 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -406,7 +406,7 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( attr_checker == nullptr ? empty_attrs_map : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = - imperative::DygraphExecutionContext( + imperative::DygraphExecutionContext( *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto* opbase_with_kernel = diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index fd13fce6a6e17a47a7a91dfa78598a99ec22f0b7..f24961885c9b85b03c561f60f375b1a21bf086dd 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -156,8 +156,8 @@ class Tracer { } phi::KernelSignature GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const; paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a5c32164bf1a28687ea6f8cc53427db67560c307..74e8ca3f229c6b7093e29cb53c0ce15e0b15d6a9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -188,6 +188,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); + DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, + std::unordered_set); // Usually use for trt dynamic shape. // TRT will select the best kernel according to opt shape diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 796c86a3ad1efe45dd8a00139b92c2642676a811..287c896e49bf254d70a5c79c818a39f913472f2f 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + } else if (pass_name == "mixed_precision_configure_pass") { + pass->Set("gpu_fp16_disabled_op_types", + new std::unordered_set( + argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index daa18d8c78bf875ebcc6571bf955a7f634948e4f..614eea24a0e2ee9d4fabd68a9374fa7c44b63ad7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else +void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( + const framework::ir::Graph &graph, + std::unordered_map *var_name_op_type_map) { + std::vector node_list = + framework::ir::TopologyVarientSort( + graph, static_cast(0)); + for (auto *op_node : node_list) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + for (auto *pre_node : op_node->inputs) { + if (pre_node->IsVar() && pre_node->Var()->Persistable()) { + var_name_op_type_map->insert(std::pair( + pre_node->Var()->Name(), op_node->Op()->Type())); + } + } + } +} + void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; @@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { if (with_dynamic_shape) { reserve_cpu_weights = true; } + + bool mixed_precision_mode = + argument->Has("use_gpu_fp16") && argument->use_gpu_fp16(); + std::unordered_map var_name_op_type_map{}; + std::unordered_set blacklist{}; + if (mixed_precision_mode) { + GetVarNameToOpTypeMap(graph, &var_name_op_type_map); + blacklist = argument->gpu_fp16_disabled_op_types(); + } + for (auto &var_name : all_vars) { if (std::count(repetitive_params.begin(), repetitive_params.end(), var_name)) { @@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { var->IsType()) { auto *t = var->GetMutable(); - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - // Copy the parameter data to a tmp tensor. - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - // Reallocation the space on GPU - t->clear(); - - // Copy parameter data to newly allocated GPU space. - paddle::framework::TensorCopySync(temp_tensor, place, t); + bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || + t->dtype() == paddle::experimental::DataType::FLOAT64; + if (mixed_precision_mode && + !blacklist.count(var_name_op_type_map[var_name]) && is_float) { + framework::Tensor half_tensor; + half_tensor.set_type(paddle::experimental::DataType::FLOAT16); + half_tensor.Resize(t->dims()); + auto *half_data = + half_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + auto *data = t->mutable_data(platform::CPUPlace()); + half_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(half_tensor, place, t); + } else { + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } } } } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index d5e98ec886e65f829a1496b1431f23aad6c4bc4c..f8209f051d53444435ed8c65b400f08bf8627553 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { #ifdef PADDLE_WITH_ASCEND_CL void CopyParamsToNpu(Argument *argument); #else - void CopyParamsToGpu(Argument *argument); + + void GetVarNameToOpTypeMap( + const framework::ir::Graph& graph, + std::unordered_map* var_name_op_type_map); + + void CopyParamsToGpu(Argument* argument); #endif }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 41c01d3b7e261314d8dc6b852f5b2a597421fe48..d08d28a3f623389790e63d45e13584a8d0db6adc 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } + void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } + void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } +void AnalysisConfig::Exp_EnableUseGpuFp16( + std::unordered_set op_list) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_gpu_fp16_ = true; + gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); +#else + LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; + use_gpu_fp16_ = false; +#endif + + Update(); +} + void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; @@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + CP_MEMBER(use_gpu_fp16_); + CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. @@ -573,6 +590,20 @@ void AnalysisConfig::Update() { #endif } + if (use_gpu_fp16_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!enable_ir_optim_) { + LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " + "enabled."; + } else if (!use_gpu()) { + LOG(ERROR) + << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; + } else { + pass_builder()->Exp_EnableUseGpuFp16(); + } +#endif + } + if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_gpu_fp16_; + for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 871ed596a3ee9d6362b03e99ca10313765826a51..a7caa3e369f80a954f36226c070ff1f7bd822a2b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -50,8 +50,7 @@ #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/string/split.h" -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -374,8 +373,7 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; return PrepareFleetExecutor(); @@ -393,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool AnalysisPredictor::PrepareFleetExecutor() { VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; if (config_.dist_config().nranks() > 1 && !CommInit()) { @@ -872,6 +869,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + if (config_.gpu_fp16_enabled()) { + argument_.SetUseGPUFp16(true); + argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -1189,8 +1191,7 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1239,8 +1240,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1287,8 +1287,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "ZeroCopyRun will use the fleet executor."; inference::Timer timer; diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 21a7e9658bbeeb16d4cbff6364aaef68edcae16d..d9992f3fbef9d6ed626410ae5b9fc881b0772aa8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,8 +18,7 @@ #include #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #endif #include "paddle/fluid/framework/naive_executor.h" @@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet exe related /// @@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; static int clone_num_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet executor related distributed::FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d..ecb5eaf982548c44eb97fde7e2b7365c9b0e9fc2 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) { ASSERT_TRUE(!config.use_onnxruntime()); } +TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { + AnalysisConfig config; + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); + ASSERT_TRUE(config.gpu_fp16_enabled()); +#else + config.DisableGpu(); +#endif + LOG(INFO) << config.Summary(); +} + } // namespace paddle namespace paddle_infer { @@ -434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) { auto predictor = CreatePredictor(config); } +TEST(Predictor, Exp_EnableUseGpuFp16) { + Config config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); +#else + config.DisableGpu(); +#endif + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 79a31555c7f0b1cb4a8d9c48bae16145d605935b..2c0945cd5b386a003ce63c86f3feb52213b378ba 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -53,7 +53,11 @@ if [ $7 == ON ]; then if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then echo "MobileNetV2.inference.model.tar.gz has been downloaded." else - wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + if [ $WIN_DETECT != "" ]; then + wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + else + wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + fi tar xzf *.tar.gz fi cd .. diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 4341fb0a9ccd8822151d4660f5a0c22901e47122..b2cfb060dd32559f6157fc456c7399736fc9fe51 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -14,7 +14,11 @@ # cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +if (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime) +else (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +endif (WITH_ONNXRUNTIME) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 18b1d09f0e8a7c4be9862991060a4706ee7cde7e..66dec0157d98e776b38ec8af81a0c006bc732bf4 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -22,12 +22,22 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/allocator.h" +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif namespace paddle_infer { using float16 = paddle::platform::float16; void Tensor::Reshape(const std::vector &shape) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + shape_.assign(shape.begin(), shape.end()); + return; + } +#endif + PADDLE_ENFORCE_EQ( name_.empty(), false, paddle::platform::errors::PreconditionNotMet( @@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + return dtype_; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = paddle::framework::TransToProtoVarType(tensor->dtype()); if (type == paddle::framework::proto::VarType::FP32) { @@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyFromCpu(data); + return; + } +#endif + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, template void Tensor::CopyToCpu(T *data) const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyToCpu(data); + return; + } +#endif + CopyToCpuImpl(data, nullptr, nullptr, nullptr); } @@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL int8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL float16 *Tensor::mutable_data(PlaceType place); -Tensor::Tensor(void *scope) : scope_{scope} { - PADDLE_ENFORCE_NOT_NULL(scope_, - paddle::platform::errors::PreconditionNotMet( - "The `scope` can not be nullptr. It should be " - "set to the pointer of scope.")); -} +Tensor::Tensor(void *scope) : scope_{scope} {} template void *Tensor::FindTensor() const { @@ -513,6 +537,26 @@ void *Tensor::FindTensor() const { } std::vector Tensor::shape() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + std::vector shape; + // input handle + if (idx_ < 0) { + shape.assign(shape_.begin(), shape_.end()); + } else { // output handle + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + auto ort_shape = info.GetShape(); + shape.assign(ort_shape.begin(), ort_shape.end()); + } + return shape; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( @@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) { device_ = device; } +#ifdef PADDLE_WITH_ONNXRUNTIME +void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; } + +void Tensor::SetOrtBinding(const std::shared_ptr binding) { + binding_ = binding; +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, static_cast(data), + size * sizeof(float16), shape, shape_len, + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); +} + +template +void Tensor::ORTCopyFromCpu(const T *data) { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "input tensor [%s] no binding ptr", name_)); + const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, + OrtMemTypeDefault); + size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, + std::multiplies()); + auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, + shape_.data(), shape_.size()); + binding->BindInput(name_.c_str(), ort_value); +} + +template +void Tensor::ORTCopyToCpu(T *data) const { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + size_t size = info.GetElementCount() * sizeof(T); + + if (place_ == PlaceType::kCPU) { + std::memcpy(static_cast(data), value.GetTensorData(), size); + } else { + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), + paddle::platform::CUDAPlace(device_), + value.GetTensorData(), size, nullptr); + } +} + +template void Tensor::ORTCopyFromCpu(const float *data); +template void Tensor::ORTCopyFromCpu(const int64_t *data); +template void Tensor::ORTCopyFromCpu(const int32_t *data); +template void Tensor::ORTCopyFromCpu(const uint8_t *data); +template void Tensor::ORTCopyFromCpu(const int8_t *data); +template void Tensor::ORTCopyFromCpu(const float16 *data); + +template void Tensor::ORTCopyToCpu(float *data) const; +template void Tensor::ORTCopyToCpu(int32_t *data) const; +template void Tensor::ORTCopyToCpu(uint8_t *data) const; +template void Tensor::ORTCopyToCpu(int8_t *data) const; +template void Tensor::ORTCopyToCpu(float16 *data) const; +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index ee82da139d8f39c26002763c4a4835050c48fc99..bd9de252a0962bc27a23b949b428d8f18f96190f 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -25,11 +25,7 @@ #include #include "paddle/fluid//platform/device/gpu/gpu_types.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" @@ -45,24 +41,23 @@ namespace paddle { -framework::proto::VarType::Type ConvertONNXType( - ONNXTensorElementDataType type) { +paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) { switch (type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: - return framework::proto::VarType::FP32; - // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: - // return DataType::FP16; + return paddle_infer::DataType::FLOAT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return paddle_infer::DataType::FLOAT16; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: - return framework::proto::VarType::INT8; + return paddle_infer::DataType::INT8; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: - return framework::proto::VarType::INT32; + return paddle_infer::DataType::INT32; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: - return framework::proto::VarType::INT64; + return paddle_infer::DataType::INT64; case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: - return framework::proto::VarType::UINT8; + return paddle_infer::DataType::UINT8; default: LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); - return framework::proto::VarType::FP32; + return paddle_infer::DataType::FLOAT32; } } @@ -87,13 +82,12 @@ bool ONNXRuntimePredictor::Init() { VLOG(3) << "ONNXRuntime Predictor::init()"; // Now ONNXRuntime only suuport CPU + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; if (config_.use_gpu()) { place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); } else { place_ = paddle::platform::CPUPlace(); } - scope_.reset(new paddle::framework::Scope()); - sub_scope_ = &scope_->NewScope(); std::string onnx_proto; paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, @@ -125,13 +119,12 @@ bool ONNXRuntimePredictor::Init() { "generated."; } session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + binding_ = std::make_shared(session_); - auto memory_info = - Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); Ort::Allocator allocator(session_, memory_info); - framework::proto::VarType::Type proto_type = - framework::proto::VarType::LOD_TENSOR; size_t n_inputs = session_.GetInputCount(); for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_.GetInputName(i, allocator); @@ -141,8 +134,6 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); - auto *ptr = scope_->Var(input_name); - framework::InitializeVariable(ptr, proto_type); allocator.Free(input_name); } @@ -155,11 +146,13 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); - auto *ptr = scope_->Var(output_name); - framework::InitializeVariable(ptr, proto_type); + + Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + allocator.Free(output_name); } - return true; } @@ -216,15 +209,26 @@ std::vector ONNXRuntimePredictor::GetOutputNames() { return output_names; } +bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, + bool is_input) { + if (is_input) { + for (auto i : input_desc_) + if (i.name == name) return true; + } else { + for (auto i : output_desc_) + if (i.name == name) return true; + } + return false; +} + std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), - platform::errors::PreconditionNotMet( - "The in variable named %s is not found in the " - "scope of the ONNXPredictor.", - name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope_.get()))); + PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -233,18 +237,19 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } + res->SetOrtMark(true); + res->SetOrtBinding(binding_); return res; } std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( const std::string &name) { - PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), - platform::errors::PreconditionNotMet( - "The out variable named %s is not found in the " - "scope of the ONNXPredictor.", - name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope_.get()))); + PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, + platform::errors::PreconditionNotMet( + "The out variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -253,46 +258,18 @@ std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } + res->SetOrtMark(true); + res->SetOrtBinding(binding_); + int size = output_desc_.size(); + for (int i = 0; i < size; ++i) + if (output_desc_[i].name == name) { + res->idx_ = i; + res->dtype_ = ConvertONNXType(output_desc_[i].dtype); + break; + } return res; } -Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, - const char *device_name) { - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - auto *var = scope_->FindVar(desc.name); - auto *tensor = var->GetMutable(); - size_t size = - tensor->numel() * - framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); - std::vector shape = phi::vectorize(tensor->dims()); - return Ort::Value::CreateTensor(memory_info, - static_cast(tensor->data()), size, - shape.data(), shape.size(), desc.dtype); -} - -void ONNXRuntimePredictor::AsTensor(const Ort::Value &value, - const ONNXDesc &desc) { - auto info = value.GetTensorTypeAndShapeInfo(); - - auto *var = scope_->FindVar(desc.name); - auto *tensor = var->GetMutable(); - tensor->Resize(phi::make_ddim(info.GetShape())); - auto dtype = ConvertONNXType(info.GetElementType()); - auto *ptr = tensor->mutable_data(place_, dtype); - - if (platform::is_cpu_place(place_)) { - std::memcpy(ptr, const_cast(value.GetTensorData()), - tensor->numel() * framework::SizeOfType(dtype)); - } else { - auto src_place = place_; - auto dst_place = place_; - memory::Copy(dst_place, ptr, src_place, - const_cast(value.GetTensorData()), - tensor->numel() * framework::SizeOfType(dtype)); - } -} - bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { @@ -302,31 +279,7 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, bool ONNXRuntimePredictor::ZeroCopyRun() { try { - Ort::IoBinding binding(session_); - std::vector inputs; - std::vector outputs; - Ort::RunOptions options; - - inputs.reserve(input_desc_.size()); - const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; - for (auto desc : input_desc_) { - inputs.push_back(GetOrtValue(desc, device_name)); - binding.BindInput(desc.name.c_str(), inputs.back()); - } - - // TODO(heliqi): Optimization —— move to Init() - for (auto desc : output_desc_) { - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - binding.BindOutput(desc.name.c_str(), memory_info); - } - - session_.Run({}, binding); - - outputs = binding.GetOutputValues(); - for (size_t i = 0; i < output_desc_.size(); ++i) { - AsTensor(outputs[i], output_desc_[i]); - } + session_.Run({}, *(binding_.get())); } catch (const std::exception &e) { LOG(ERROR) << e.what(); return false; @@ -345,9 +298,9 @@ uint64_t ONNXRuntimePredictor::TryShrinkMemory() { } ONNXRuntimePredictor::~ONNXRuntimePredictor() { - if (sub_scope_) { - scope_->DeleteScope(sub_scope_); - } + binding_->ClearBoundInputs(); + binding_->ClearBoundOutputs(); + memory::Release(place_); } diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 7fb07aa97bd2746773192456ddeba941a24e8906..d01756e4b96b132e3f9c3815e96f612433616ff2 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -94,9 +94,8 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// \param[in] AnalysisConfig config /// explicit ONNXRuntimePredictor(const AnalysisConfig &config) - : config_(config) { + : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") { predictor_id_ = inference::GetUniqueId(); - env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); } /// /// \brief Destroy the ONNXRuntime Predictor object @@ -177,30 +176,17 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// std::unique_ptr Clone() override; - std::shared_ptr scope_; - private: /// - /// \brief get the Ort Value(input Tensor). - /// - /// \param[in] desc ONNXDesce(name、shape、dtype) - /// - /// \param[in] device_name "cpu" or "gpu" of device - /// - /// \return get a Ort::Value - /// - Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); - - /// - /// \brief Ort::Value to Paddle::ZeroCopyTensor. + /// \brief Whether to find in/out by name. /// - /// \param[in] value Ort::Value(output Tensor) + /// \param[in] name input or output name /// - /// \param[in] desc a ONNXDesce(name、shape、dtype) + /// \param[in] is_input input(true) or output(false) /// - /// \return get a Ort::Value + /// \return Whether to find by name /// - void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + bool FindONNXDesc(const std::string &name, bool is_input); private: AnalysisConfig config_; @@ -208,9 +194,9 @@ class ONNXRuntimePredictor : public PaddlePredictor { // ONNXRuntime Ort::Env env_; Ort::Session session_{nullptr}; + std::shared_ptr binding_; platform::Place place_; - framework::Scope *sub_scope_{nullptr}; std::vector input_desc_; std::vector output_desc_; int predictor_id_; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7b765e3fa8a24ef1b81b68da8ba12dd8e5577572..bdfe0e46e9ca4519c294a181cda6b8c4b87a6b9b 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void DisableGpu(); + /// + /// \brief Enable GPU fp16 precision computation, in experimental state. + /// + /// \param op_list The operator type list. + /// + void Exp_EnableUseGpuFp16(std::unordered_set op_list = {}); + /// + /// \brief A boolean state telling whether the GPU fp16 precision is turned + /// on. + /// + /// \return bool Whether the GPU fp16 precision is turned on. + /// + bool gpu_fp16_enabled() const { return use_gpu_fp16_; } /// /// \brief Turn on XPU. @@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig { int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool thread_local_stream_{false}; + bool use_gpu_fp16_{false}; + std::unordered_set gpu_fp16_disabled_op_types_{ + "conv2d_fusion", "conv2d", "roll", "strided_slice"}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 22d9dedb32ebfcc229e0034cc5cf6092907dc8df..95975d8f2a892e709e5591135f96fbff07eb62e3 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; } +void GpuPassStrategy::Exp_EnableUseGpuFp16() { + passes_.assign({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + // "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 +// cudnn8.0 has memory leak problem in conv + eltwise + act, so we +// disable the pass. +#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100) + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // +#endif + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", // + "mixed_precision_configure_pass", // + "runtime_context_cache_pass" // + }); + + use_gpu_fp16_ = true; +} + void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 351cf71e5ca7493928dfd81d776d847463f3b7bf..02290ed33ff1cd4f72d707d6f9d23f16e05c321b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. virtual void EnableCUDNN() {} + /// \brief Enable use gpu fp16 kernel. + virtual void Exp_EnableUseGpuFp16() {} + /// \brief Enable the use of MKLDNN. /// The MKLDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. @@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } + /// \brief Check if we are using gpu fp16 kernel. + /// \return A bool variable implying whether we are in gpu fp16 mode. + bool use_gpu_fp16() const { return use_gpu_fp16_; } + /// \brief Check if we are using xpu. /// \return A bool variable implying whether we are in xpu mode. bool use_xpu() const { return use_xpu_; } @@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_npu_{false}; bool use_ipu_{false}; bool use_mkldnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; @@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; + /// \brief Enable the use of gpu fp16 kernel. + void Exp_EnableUseGpuFp16() override; + /// \brief Not supported in GPU mode yet. void EnableMKLDNN() override; @@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { protected: /// \cond Protected bool use_cudnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 5a98d109aed79cc5bcefdc01b47a166bdf9c01d9..2afe2d32e2f60e47136b1e2f002b0e98c9b17cd2 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -18,6 +18,11 @@ #include "paddle_infer_declare.h" // NOLINT +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#endif + namespace paddle_infer { /// \brief Experimental. @@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor { PlaceType place_; int device_; +#ifdef PADDLE_WITH_ONNXRUNTIME + bool is_ort_tensor_{false}; + std::vector shape_; + std::weak_ptr binding_; + int idx_{-1}; + + void SetOrtMark(bool is_ort_tensor); + + void SetOrtBinding(const std::shared_ptr binding); + + template + void ORTCopyFromCpu(const T* data); + + template + void ORTCopyToCpu(T* data) const; +#endif + friend class paddle_infer::contrib::TensorUtils; #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) friend class paddle_infer::InferApiTesterUtils; diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 67e7c78b62e9d212b5c1738403361d77d7a3925b..496e8932a690dbcd87001da4f7e017fc86d6bff5 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 1946f9e28388e3ab6d1d580d0f7d91c1ef3e604f..1ad82df41737c4093d0b5518c754ed85c505b8be 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace paddle USE_OP_ITSELF(relu); -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 861e98e4437564bfe5fae2a575741beb1d8823de..67d44184a76d0552b667c6d5a3d9466582e33558 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace paddle { namespace inference { @@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); return cudaGetLastError() != cudaSuccess; @@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue( cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); } else { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 61e292a922f0e98a958d4fe2f8fc7850bdf47e18..4a44448dc84cf744cdf061031bdf7fae8f658c4b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -219,6 +219,12 @@ class AllocatorFacadePrivate { } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); +#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 4205f2253a652ccc5f6d4886df1b1194f5e5062f..845d0ed073b32cc136ec6b9d76c9e3073d7b051a 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1485,6 +1485,17 @@ REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, + SoftShrinkGradFunctor); +REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); +REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor, + LogSigmoidGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1519,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad, ops::SigmoidTripleGradFunctor::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -// Register Sigmoid/GradSigmoid Kernels -REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, - SigmoidGradFunctor); - -// Register DoubleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>); - /* ========================================================================== */ /* ========================== tanh register ============================= */ @@ -1626,22 +1613,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL(elu, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - elu_grad, ops::ELUGradKernel, - ops::ELUGradKernel); -REGISTER_OP_CPU_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); - /* ========================================================================== */ /* ======================== logit register ============================ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index b076db01c22c62b17fdd85b7208467eea1375fed..f1984af6e15eac6682bd341f470727b899e82f3a 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -238,15 +238,6 @@ struct BaseActivationFunctor { AttrPair GetAttrs() { return AttrPair(); } }; -// sigmoid(x) = 1 / (1 + exp(-x)) -template -struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); - } -}; - #define USE_PHI_FUNCTOR(name) \ template \ using name##Functor = phi::funcs::name##Functor; \ @@ -279,181 +270,20 @@ USE_PHI_FUNCTOR(BRelu) USE_PHI_FUNCTOR(ThresholdedRelu) USE_PHI_FUNCTOR(LeakyRelu) USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) +USE_PHI_FUNCTOR(HardShrink) +USE_PHI_FUNCTOR(SoftShrink) +USE_PHI_FUNCTOR(TanhShrink) +USE_PHI_FUNCTOR(Silu) +USE_PHI_FUNCTOR(ELU) +USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) +USE_PHI_FUNCTOR(Sigmoid) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_FUNCTOR(LogSigmoid) +USE_PHI_FUNCTOR(HardSigmoid) template -struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out * (static_cast(1) - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut -> SigmoidGradGrad -> DOutNew - DDX DDOut - - DDOut = (1-Out)*Out*DDX - DOutNew = (1-2*Out)*DOut*DDX -*/ -template -struct SigmoidGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); - - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); - dout_new.device(*d) = - (static_cast(1) - static_cast(2) * out) * dout * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); - ddout.device(*d) = (static_cast(1) - out) * out * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut D_Dout - DDx -> SigmoidTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (1-2*Out)*DDx*D_Dout_new - D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new - D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct SigmoidTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); - d_dOut.device(*d) = - (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); - d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + - (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// silu(x) = x / (1 + exp(-x)) -template -struct SiluFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); - out.device(d) = x * temp; - } -}; - -// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) -template -struct SiluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) - auto temp2 = x * (-x).exp(); // x*e^(-x) - dx.device(d) = dout * ((static_cast(1) / temp1) * - (static_cast(1) + (temp2 / temp1))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// Originally: logsigmoid(x) = -log (1 + exp(-x)) -// For numerical stability, we can use the log-sum-exp trick: -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// We can rewrite the above equation as: -// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] -// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) -// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - -// max(-x, 0))) -// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) -// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) -// -// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) -// + exp(-x - max(-x, 0)))) -template -struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); - } -}; - -// Originally: f' = exp(-x) / (1 + exp(-x)) -// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + -// exp(-x - max(-x, 0))) -template -struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - dx.device(d) = - dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; +using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; // exp(x) = e^x template @@ -512,99 +342,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhShrinkFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x - x.tanh(); - } -}; - -template -struct TanhShrinkGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (x.tanh() * x.tanh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct HardShrinkFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - template - void operator()(Device d, X x, Out out) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 || temp2).template cast(); - } -}; - -template -struct HardShrinkGradFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 || temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 -// otherwise -template -struct SoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); - } -}; - -template -struct SoftShrinkGradFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // sqrt(x) = x^(1/2) template struct SqrtFunctor : public BaseActivationFunctor { @@ -1036,59 +773,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct ELUFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - (x < static_cast(0)) - .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); - } -}; - -template -struct ELUGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - dx.device(d) = (out > static_cast(0)) - .select(dout, dout * (out + static_cast(alpha))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - dx.device(d) = (x > static_cast(0)) - .select(dout, dout * static_cast(alpha) * x.exp()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template class ELUGradKernel : public framework::OpKernel { public: @@ -1263,43 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct HardSigmoidFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto temp = x * static_cast(slope) + static_cast(offset); - out.device(d) = - temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); - } -}; - -template -struct HardSigmoidGradFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((out > static_cast(0)) * (out < static_cast(1))) - .template cast() * - static_cast(slope); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SwishFunctor : public BaseActivationFunctor { float beta; @@ -1354,44 +1001,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ELUGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* ddX, framework::Tensor* ddOut, - const framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); - - if (dX) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); - dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast(); - } - - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); - ddout.device(*d) = ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CELUGradGradFunctor : public BaseActivationFunctor { float alpha; @@ -1565,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } -template -class SigmoidDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - // extract ddx(input) and out(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - // set output ddout - ddOut = ctx.Output("DDOut"); - // extract dOut(intput) - dOut = ctx.Input("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - dOutNew = ctx.Output("DOutNew"); - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -// Out, DDX, DOut, D_DDOut, D_DOut_New // input -// D_OutNew, D_DOut, D_DDx // output -template -class SigmoidTripleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - -template -class TanhDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - - // extract ddx(input) and out(input) - auto ddx_var = ctx.InputVar("DDX"); - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable ddx, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "Cannot get input Variable out, variable name = %s", - ctx.InputName("Out"))); - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - - // set output ddout - auto ddout_var = ctx.OutputVar("DDOut"); - if (ddout_var) { - ddOut = ctx.Output("DDOut"); - } - - // extract dOut(intput) - auto dout_var = ctx.InputVar("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dout_var, platform::errors::NotFound( - "Cannot get input Variable dout_var, variable name = %s", - ctx.InputName("DOut"))); - dOut = ctx.Input("DOut"); - - // set output dout_new - auto dout_new_var = ctx.OutputVar("DOutNew"); - if (dout_new_var) { - dOutNew = ctx.Output("DOutNew"); - } - - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -template -class TanhTripeGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2151,26 +1555,19 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace operators } // namespace paddle -#define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ - __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ - __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ - __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ - __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ - __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ - __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ - __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ - __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ - __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(mish, Mish, MishFunctor, MishGradFunctor); \ +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ + __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ + __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 256f20db08445e8b8d5933aa0e3151f69fcb5b10..7c1b288080162e2a5bf847a795fc640ab5e5e4e1 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -15,138 +15,11 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { -template -struct CudaSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(one / (one + exp(-x))); - } -}; - -template -struct CudaSigmoidGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out * (one - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaSiluFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x / (one + exp(-x))); - } -}; - -template -struct CudaSiluGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp = one / (one + exp(-x)); - return static_cast(dout * (temp * (one + x * (one - temp)))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaLogSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // logsigmoid(x) = log(1 / (1 + exp(-x))) - // For numerical stability, - // logsigmoid(x) = - // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType temp = x > zero ? zero : -x; - return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); - } -}; - -template -struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // dx = dout * exp(-x) / (1 + exp(-x)) - // For numerical stability: - // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, - // 0))) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp1 = x > zero ? zero : -x; - MPType temp2 = exp(-x - temp1); - return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // softshrink(x) = x - lambda, if x > lambda; - // x + lambda, if x < -lambda; - // 0, otherwise. - __device__ __forceinline__ T operator()(const T x) const { - T l = static_cast(lambda); - T temp1 = static_cast(x > l); - T temp2 = static_cast(x < -l); - return temp1 * (x - l) + temp2 * (x + l); - } -}; - -template -struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T l = static_cast(lambda); - return (x >= -l && x <= l) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -615,109 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhShrinkFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanhshrink(x) = x - tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x - tanh(x)); - } -}; - -template -struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * tanh(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * tanh(x) * tanh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaHardShrinkFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x - __device__ __forceinline__ T operator()(const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : x; - } -}; - -template -struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = (x > -threshold && x < threshold) ? 0 : dout - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaHardSigmoidFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // hard_sigmoid(x) = 0, when x <= -3 - // 1, when x >= 3 - // x * slope + offset, otherwise - __device__ __forceinline__ T operator()(const T x) const { - T temp = x * static_cast(slope) + static_cast(offset); - T temp_max = temp > zero ? temp : zero; - T temp_min = temp_max < one ? temp_max : one; - return temp_min; - } -}; - -template -struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // dx = (out > 0 && out < 1) ? dout * slope : 0 - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return (out > zero && out < one) ? dout * static_cast(slope) : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSwishFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -863,110 +633,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaELUFunctor : public BaseActivationFunctor { - using CT = typename details::MPTypeTrait::Type; - CT zero = static_cast(0.0f); - CT one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // elu(x) = x, if x > 0 - // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T arg_x) const { - CT x = static_cast(arg_x); - CT temp = static_cast(alpha) * (exp(x) - one); - CT res = x > zero ? x : temp; - return static_cast(res); - } -}; - -template -struct CudaELUGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType a = static_cast(alpha); - MPType out_pos = static_cast(out > zero); - MPType out_neg = static_cast(out <= zero); - return static_cast(dout * (out_pos + out_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType x = static_cast(arg_x); - MPType a = static_cast(alpha); - MPType x_pos = static_cast(x > zero); - MPType x_neg = static_cast(x <= zero); - return static_cast(dout * (x_pos + x_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - auto* x = ctx.Input("X"); - auto* d_x = ctx.Output(framework::GradVarName("X")); - d_x->mutable_data(ctx.GetPlace()); - const float alpha = ctx.Attr("alpha"); - - auto& dev_ctx = ctx.device_context(); - std::vector ins = {d_out, out}; - std::vector outs = {d_x}; - if (alpha > 0) { - CudaELUGradFunctor functor; - functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } else { - CudaELUGradNegativeAlphaFunctor functor; - functor.alpha = alpha; - ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - template struct CudaCELUFunctor : public BaseActivationFunctor { using CT = typename details::MPTypeTrait::Type; @@ -1099,6 +765,18 @@ USE_PHI_FUNCTOR(CudaTanh) USE_PHI_FUNCTOR(CudaBRelu) USE_PHI_FUNCTOR(CudaLeakyRelu) USE_PHI_FUNCTOR(CudaThresholdedRelu) +USE_PHI_FUNCTOR(CudaHardShrink) +USE_PHI_FUNCTOR(CudaSoftShrink) +USE_PHI_FUNCTOR(CudaTanhShrink) +USE_PHI_FUNCTOR(CudaSilu) +USE_PHI_FUNCTOR(CudaELU) +USE_PHI_FUNCTOR(CudaSigmoid) +USE_PHI_FUNCTOR(CudaLogSigmoid) +USE_PHI_FUNCTOR(CudaHardSigmoid) + +template +using CudaELUGradNegativeAlphaFunctor = + phi::funcs::CudaELUGradNegativeAlphaFunctor; } // namespace operators } // namespace paddle @@ -1158,26 +836,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== elu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - elu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - elu_grad, ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel); - -REGISTER_OP_CUDA_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); /* ========================================================================== */ /* ======================== celu register ============================ */ @@ -1193,35 +851,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== sigmoid register ============================ - */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel< - plat::CUDADeviceContext, - ops::SigmoidTripleGradFunctor>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1359,9 +988,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ - CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ @@ -1381,8 +1007,6 @@ REGISTER_OP_CUDA_KERNEL( CudaTanhShrinkGradFunctor); \ __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ CudaHardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ - CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ @@ -1390,63 +1014,221 @@ REGISTER_OP_CUDA_KERNEL( FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 684ac5bafd0ef430f8424614104a865b3cbe29c6..ea6614cbfbdf874df029ab349f4373f27e5c8e21 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -16,6 +16,9 @@ limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasInput("X")) { - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::SELECTED_ROWS || - type == framework::proto::VarType::LOD_TENSOR) { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { - if (ctx->IsRuntime()) { - // The runtime output shape is determined in kernel. - return; - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - } - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, @@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference { } }; -class AssignKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - if (x == nullptr) { - return; - } - PADDLE_ENFORCE_EQ( - ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of assign_op is not found.")); - auto *out = ctx.OutputVar("Out"); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - - framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); - } -}; - class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, ops::AssignGradMaker, ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer, - ops::AssignInferVarType); - -REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel, plat::bfloat16, - ops::AssignKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel); -#endif + ops::AssignInferVarType, AssignInferShapeFunctor); diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index b452dea8536dd98d6d4060d5224e39daf9137c50..b91eb50646feca30046915248d45ee2e91cabc39 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -29,7 +29,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(assign); +USE_OP_ITSELF(assign); USE_OP_DEVICE_KERNEL(assign, NPU); template diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 174207deb08b84194d6f20fe04e4c27245295caf..5194c8772e47bca5ec728079b4b2dce883e39c22 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -21,6 +21,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -297,184 +300,6 @@ The required data format for this layer is one of the following: )DOC"); } -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - - bool global_stats = test_mode || use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensionss is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // input dimension is 2 and the format is NCHW. The input can be regarded - // as NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - if (!global_stats) { - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - saved_mean->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap saved_variance_e( - saved_variance->mutable_data(ctx.GetPlace()), C); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); - - if ((N * sample_size) == 1) { - // Only 1 element in normalization dimension, - // we skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - return; - } - - switch (data_layout) { - case DataLayout::kNCHW: { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - saved_mean_e(nc % C) += x_arr.col(nc).sum(); - } - saved_mean_e /= N * sample_size; - for (int nc = 0; nc < N * C; ++nc) { - saved_variance_e(nc % C) += - (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); - } - saved_variance_e /= N * sample_size; - break; - } - case DataLayout::kNHWC: { - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - for (int i = 0; i < N * sample_size; ++i) { - saved_mean_e += x_arr.col(i); - } - saved_mean_e /= N * sample_size; - for (int i = 0; i < N * sample_size; ++i) { - saved_variance_e += - (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); - } - saved_variance_e /= N * sample_size; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - momentum = mom_tensor->data()[0]; - } - - running_mean_arr = - running_mean_arr * momentum + saved_mean_e * (1. - momentum); - running_var_arr = - running_var_arr * momentum + saved_variance_e * (1. - momentum); - } - - // use SavedMean and SavedVariance to do normalize - Eigen::Array inv_std(C); - if (global_stats) { - ConstEigenVectorArrayMap var_arr( - ctx.Input("Variance")->data(), C); - inv_std = (var_arr + epsilon).sqrt().inverse(); - } else { - EigenVectorArrayMap saved_inv_std( - ctx.Output("SavedVariance")->data(), C); - // inverse SavedVariance first, gradient will use it too. - saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); - inv_std = saved_inv_std; - } - ConstEigenVectorArrayMap mean_arr( - global_stats ? ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), - C); - - // ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - Eigen::Array new_scale = inv_std * scale_arr; - Eigen::Array new_bias = - bias_arr - mean_arr * inv_std * scale_arr; - - switch (data_layout) { - case DataLayout::kNCHW: { - EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, - N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); - } - break; - } - case DataLayout::kNHWC: { - EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, - N * sample_size) = - (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * - new_scale) - .colwise() + - new_bias; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %d", data_layout)); - } - } -}; - void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { // check input OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad"); @@ -585,261 +410,6 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - // if the input of batch norm is stop_gradient, d_x is null. - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - // Get the size for each dimension. - // NCHW [batch_size, in_channels, in_height, in_width] - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - // input dimension is 2 and the format is NCHW. The input can be regarded as - // NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - const T *mean_data = saved_mean->data(); - const T *inv_var_data = saved_inv_variance->data(); - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - T *d_bias_data = nullptr; - T *d_scale_data = nullptr; - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - d_bias_data = d_bias->mutable_data(ctx.GetPlace()); - d_scale_data = d_scale->mutable_data(ctx.GetPlace()); - } - - // d_bias = np.sum(d_y, axis=0) - // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) - // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) - // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) - EigenVectorArrayMap d_bias_arr(d_bias_data, C); - EigenVectorArrayMap d_scale_arr(d_scale_data, C); - - if (d_scale && d_bias) { - d_bias_arr.setZero(); - d_scale_arr.setZero(); - } - - if (d_x && (N * sample_size) == 1 && !use_global_stats) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - return; - } - - int scale_coefff = use_global_stats ? 1 : N * sample_size; - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; - - Tensor dy_sum; - dy_sum.Resize({C}); - dy_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_sum_arr(dy_sum.mutable_data(ctx.GetPlace()), - C); - - Tensor dy_mul_x_sub_mean_mul_invstd_sum; - dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()), C); - - dy_sum_arr.setZero(); - dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); - - // inplace calculation - // Y: ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - // X: (y - bias) / scale / (inv_var) + est_mean - // formula transform ====> - // (y - bias) / (scale * inv_var) + est_mean - switch (data_layout) { - case DataLayout::kNCHW: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), - sample_size, N * C); - ConstEigenArrayMap y_data(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / - scale_inv_var_nhw(nc % C) / scale_coefff + - mean_arr(nc % C); - } - } - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); - - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - dy_sum_arr(c) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), - sample_size, N * C); - if (!use_global_stats) { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * - inv_var_arr(c)); - } - } else { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); - } - } - } - break; - } - case DataLayout::kNHWC: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), C, - N * sample_size); - ConstEigenArrayMap y_data(x->data(), C, N * sample_size); - for (int nhw = 0; nhw < N * sample_size; nhw++) { - x_data.col(nhw) = (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / - scale_coefff + - mean_arr; - } - } - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); - - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - dy_sum_arr += d_y_arr.col(nhw); - dy_mul_x_sub_mean_mul_invstd_sum_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, - N * sample_size); - if (!use_global_stats) { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - - (x_arr.col(nhw) - mean_arr) * - dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); - } - } else { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw); - } - } - } - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - } -}; - template void BatchNormGradMaker::Apply(GradOpPtr op) const { op->SetType(this->ForwardOpType() + "_grad"); @@ -951,335 +521,16 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const float epsilon = ctx.Attr("epsilon"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - dX->mutable_data(ctx.GetPlace()); - ddY->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - - const auto &x_dims = X->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = X->numel() / C; - phi::funcs::SetConstant set_constant; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - // transpose NCHW -> NHWC for easy calculate - Tensor transformed_x(X->type()); - Tensor transformed_dy(dY->type()); - Tensor transformed_ddx(ddX->type()); - - Tensor transformed_dx(dX->type()); - Tensor transformed_ddy(ddY->type()); - if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - // Input Tensor - ResizeToChannelLast(ctx, X, - &transformed_x); - TransToChannelLast(ctx, X, &transformed_x); - ResizeToChannelLast(ctx, dY, - &transformed_dy); - TransToChannelLast(ctx, dY, - &transformed_dy); - ResizeToChannelLast(ctx, ddX, - &transformed_ddx); - TransToChannelLast(ctx, ddX, - &transformed_ddx); - // Output Tensor - ResizeToChannelLast(ctx, dX, - &transformed_dx); - ResizeToChannelLast(ctx, ddY, - &transformed_ddy); - } else { - transformed_x.ShareDataWith(*X); - transformed_dy.ShareDataWith(*dY); - transformed_ddx.ShareDataWith(*ddX); - - transformed_dx.ShareDataWith(*dX); - transformed_ddy.ShareDataWith(*ddY); - } - - ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - Tensor mean_tile; - mean_tile.Resize({C, sample_size}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - - Tensor inv_var_tile; - inv_var_tile.Resize({C, sample_size}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), C, sample_size); - - mean_tile_data = mean_arr.replicate(1, sample_size); - inv_var_tile_data = inv_var_arr.replicate(1, sample_size); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({C, sample_size}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - scale_tile_data = scale_arr.replicate(1, sample_size); - - ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); - ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({C, sample_size}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), C, sample_size); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - EigenArrayMap dx_arr(transformed_dx.mutable_data(ctx.GetPlace()), C, - sample_size); - dx_arr.setZero(); - if (use_global_stats) { - // math: dx = (ddscale * dy) * inv_var - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; - } - } else { - // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, - // axis=(n,h,w)) * - // np.sum(dy, axis=(n,h,w)) - - // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - - // mean), - // axis=(n,h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / - // NxHxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * - // np.sum(dy, - // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var - // * - // np.mean(dy, axis=(n,h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w))) - - if (ddX) { - dx_arr += - (x_sub_mean_mul_invstd_arr * inv_var_tile_data * - inv_var_tile_data / sample_size) - .colwise() * - (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - - (dy_arr * ddx_arr).rowwise().sum() + - 3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (dy_arr.rowwise().sum() / sample_size - dy_arr); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (ddx_arr.rowwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr += (dy_arr * inv_var_tile_data - - (dy_arr.rowwise().sum().replicate(1, sample_size) / - sample_size) * - inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size) * - ddscale_tile_data; - } - } - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_dx, dX); - } - } - if (dScale) { - dScale->mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - dscale_arr.setZero(); - if (use_global_stats) { - // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var - if (ddX) { - dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); - } - } else { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * - // ddx - if (ddX) { - Tensor first_grad; - first_grad.Resize({C, sample_size}); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), C, sample_size); - first_grad_arr.setZero(); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); - } - } - } - - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - EigenArrayMap ddy_arr(transformed_ddy.mutable_data(ctx.GetPlace()), - C, sample_size); - ddy_arr.setZero(); - if (use_global_stats) { - // math: ddy = r * ddx * inv_var + ddbias + - // ddscale * (x - mean) * inv_var - if (ddX) { - ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; - } - } else { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(n,h,w))) - if (ddX) { - ddy_arr += - scale_tile_data * inv_var_tile_data * - (ddx_arr - - ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - } - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - } - - if (ddBias) { - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - - ddy_arr += ddbias_tile_data; - } - - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_ddy, ddY); - } - } - } -}; - DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(batch_norm, BatchNormInferShapeFunctor, + PD_INFER_META(phi::BatchNormInferMeta)); + REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index f8d37d685b929258118e5d4b9d02a6be9d71c078..d274e8d2c006d7cbfe8337eab5c6d9a57a62e5ca 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -113,23 +113,5 @@ class BatchNormOpInferVarType } }; -template -class BatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 0e64b461786cce845f7388a520c09101dcba9c09..6507890a8b5dcd7a415215caf51bd05c2857db5e 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -20,6 +21,8 @@ namespace operators { template class MLUBatchNormOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto &place = ctx.GetPlace(); @@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel { // alloc memory y->mutable_data(place); - mean_out->mutable_data(place); - variance_out->mutable_data(place); - saved_mean->mutable_data(place); - saved_variance->mutable_data(place); + mean_out->mutable_data(place); + variance_out->mutable_data(place); + saved_mean->mutable_data(place); + saved_variance->mutable_data(place); Tensor transformed_x; Tensor transformed_y; @@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { template class MLUBatchNormGradOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto d_x_tmp = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto scale_grad_tmp = ctx.AllocateTmpTensor( + scale->dims(), dev_ctx); auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_x == nullptr) { d_x = &d_x_tmp; @@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { const auto &place = ctx.GetPlace(); d_x->mutable_data(place); - d_scale->mutable_data(place); - d_bias->mutable_data(place); + d_scale->mutable_data(place); + d_bias->mutable_data(place); use_global_stats = is_test || use_global_stats; diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index a70b6e991161dfef99cc0b6da9fba9a2696cc08e..ae03ecbcb16a0441cdb87e0ec579c07d872bc9a2 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *variance_out = ctx.Output("VarianceOut"); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); // if MomentumTensor is set, use MomentumTensor value, momentum // is only used in this training branch @@ -170,8 +170,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); if (use_global_stats) { const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8213e877f722433488cd826bb63cba376972c57a..9be63a85fc0de3ba75cb9741b25f7c312cd9f60b 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -27,6 +27,9 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -841,6 +844,8 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType( } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(conv2d, Conv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -851,6 +856,8 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad, REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad); // depthwise convolution op +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d, DepthwiseConv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -860,6 +867,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad, ops::Conv2DDoubleGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad_grad, ops::ConvOpDoubleGrad); +DECLARE_INFER_SHAPE_FUNCTOR(conv3d, Conv3dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, ops::ConvOpInferVarType, ops::Conv3DGradMaker, diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 8897f7b229c321e28609d8ef739f4388f5cb586a..fcda16a3e72ac9250a0206e69f50c75d71cb0d64 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -356,7 +356,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); const auto& runner = NpuOpRunner( diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc index bff6673429d9a4088c65f9dc02c1546f23d96878..889cdac8f6882744c7a7044861d237964e6f6ac0 100644 --- a/paddle/fluid/operators/cumprod_op.cc +++ b/paddle/fluid/operators/cumprod_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/cumprod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class CumprodOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod"); - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } }; class CumprodOpMaker : public framework::OpProtoAndCheckerMaker { @@ -81,22 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker, - ops::CumprodGradOpMaker); + ops::CumprodGradOpMaker, + CumprodInferShapeFunctor); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); - -REGISTER_OP_CPU_KERNEL( - cumprod, ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel>, - ops::CumprodOpCPUKernel>); - -REGISTER_OP_CPU_KERNEL( - cumprod_grad, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel>, - ops::CumprodGradOpCPUKernel>); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu deleted file mode 100644 index f792d6832917f52573dce7ee3e449c2f4be63584..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumprod_op.cu +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/inclusive_scan.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -struct MultiplyFunctor { - HOSTDEVICE T operator()(T a, T b) const { return a * b; } -}; - -template -class CumprodOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); - auto dim = ctx.Attr("dim"); - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - - const auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = - ctx.template device_context(); - math::InclusiveScan>( - x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast(1), - MultiplyFunctor(), /*reverse=*/false, dev_ctx); - } -}; - -template -struct IsZeroFunctor { - HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } -}; - -template -struct CumprodGradFunctorExceptFirstZero { - HOSTDEVICE CumprodGradFunctorExceptFirstZero( - const T *x, const T *y, const T *dy_mul_y_reversed_cumsum, - const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx, - int64_t *first_zero_idx, T *x_filled_one) - : x_(x), - y_(y), - dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), - zero_mask_(zero_mask), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx), - first_zero_idx_(first_zero_idx), - x_filled_one_(x_filled_one) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto inner_idx = idx % inner_dim_; - auto outer_idx = idx / (mid_dim_ * inner_dim_); - auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; - auto mask = zero_mask_[idx]; - bool should_fill_one = true; - - if (mask == 0) { - dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; - if (mid_idx == mid_dim_ - 1) { - // record first zero position as -1, i.e., no zero - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; - } - } else if (mid_idx > 0) { // mask > 0 - if (zero_mask_[idx - inner_dim_] > 0) { // not first zero - dx_[idx] = 0; - should_fill_one = false; - } else { - // idx is the first zero position, it should be recorded - dx_[idx] = y_[idx - inner_dim_]; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; - } - } else { // the first zero position is index 0 - dx_[idx] = 1; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; - } - - x_filled_one_[idx] = should_fill_one ? 1 : x_[idx]; - } - - private: - const T *x_; - const T *y_; - const T *dy_mul_y_reversed_cumsum_; - const uint8_t *zero_mask_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; - int64_t *first_zero_idx_; - T *x_filled_one_; -}; - -template -struct FillFirstZeroPositionGradFunctor { - HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, - const T *grad_value, - size_t mid_dim, size_t inner_dim, - T *dx) - : first_zero_idx_(first_zero_idx), - grad_value_(grad_value), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto outer_idx = idx / inner_dim_; - auto inner_idx = idx % inner_dim_; - auto mid_idx = first_zero_idx_[idx]; - if (mid_idx >= 0) { - auto full_idx = - outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; - dx_[full_idx] *= grad_value_[full_idx]; - } - } - - private: - const int64_t *first_zero_idx_; - const T *grad_value_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; -}; - -/* -Reference to -https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp -input: x, y, dL/dy -output: dL/dx -dL/dx[i] = sum{0<=j k, dL/dx[i] = 0; -i < k, dL/dx[i] = 1/x[i]*sum{i<=j k - dx[i] = 0; - x_filled_one[i] = x[i]; - } - } - } -} -T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j])); -if (zero_index != -1) { - dx[zero_index] *= T[zero_index]; -} -*/ - -template -class CumprodGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Out"); - const auto *dy = - ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto dim = ctx.Attr("dim"); - - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; - - size_t numel = outer_dim * mid_dim * inner_dim; - - const auto *x_data = x->data(); - const auto *y_data = y->data(); - const auto *dy_data = dy->data(); - - auto place = ctx.GetPlace(); - const auto &dev_ctx = - ctx.template device_context(); - auto *dx_data = dx->mutable_data(place); - - // deal with complex - const T *x_data_deal; - const T *y_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr y_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto *x_data_conj = reinterpret_cast(x_conj->ptr()); - y_conj = memory::Alloc(place, numel * sizeof(T)); - auto *y_data_conj = reinterpret_cast(y_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_y(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); - for_range_y(functor_y); - x_data_deal = x_data_conj; - y_data_deal = y_data_conj; - } else { - x_data_deal = x_data; - y_data_deal = y_data; - } - -// Step 1: find cummax-ed zero mask of x -#ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); -#else - const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); -#endif - auto zero_mask_without_cummax = - memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_without_cummax_data = - reinterpret_cast(zero_mask_without_cummax->ptr()); - thrust::transform( - exec_policy, thrust::device_pointer_cast(x_data_deal), - thrust::device_pointer_cast(x_data_deal) + numel, - thrust::device_pointer_cast(zero_mask_without_cummax_data), - IsZeroFunctor()); - - auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); - math::InclusiveScan( - zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Max(), /*reverse=*/false, - dev_ctx); - zero_mask_without_cummax = nullptr; - - // Step 2: calculate reversed cumsum(dy * y) - auto dy_mul_y = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(y_data_deal), - thrust::device_pointer_cast(dy_mul_y_data), - MultiplyFunctor()); - - auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_reversed_cumsum_data = - reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); - math::InclusiveScan( - dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), /*reverse=*/true, dev_ctx); - - // Step 3: calculate the gradient value except the first zero position. - // The gradient value of the first zero position is filled with out[idx-1], - // while the gradient value of the other positions are calculated out - // completely. This functor also: - // (1) find the first zero index, i.e., first_zero_idx_data. - // (2) fill x_filled_one, which satifies - // x_filled_one[i] = x[i], i > pos - // x_filled_one[i] = 1, i <= pos - auto first_zero_idx = - memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t)); - auto *first_zero_idx_data = - reinterpret_cast(first_zero_idx->ptr()); - auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory - platform::ForRange for_range(dev_ctx, numel); - CumprodGradFunctorExceptFirstZero functor_except_first_zero( - x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data, - mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data); - for_range(functor_except_first_zero); - - // Step 4: calculate cumprod of x_filled_one - auto *x_filled_one_cumprod_data = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan>( - x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim, - inner_dim, static_cast(1), MultiplyFunctor(), /*reverse=*/false, - dev_ctx); - - // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) - auto *dy_mul_x_filled_one_cumprod = - dy_mul_y_data; // reuse former allocated memory - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(x_filled_one_cumprod_data), - thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), - MultiplyFunctor()); - auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan( - dy_mul_x_filled_one_cumprod, - dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), - /*reverse=*/true, dev_ctx); - - // Step 6: fill zero pos gradient value - platform::ForRange - for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim); - FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( - first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum, - mid_dim, inner_dim, dx_data); - for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - cumprod, ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel>, - ops::CumprodOpCUDAKernel>); - -REGISTER_OP_CUDA_KERNEL( - cumprod_grad, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel>, - ops::CumprodGradOpCUDAKernel>); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h deleted file mode 100644 index 74ed2008ae98380388d874529264c6b6c0b5a49a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumprod_op.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim, - size_t* outer_dim, size_t* mid_dim, - size_t* inner_dim) { - PADDLE_ENFORCE_GE( - cumprod_dim, -dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be larger than the opposite " - "rank of input x which is %d.But received dim=%d", - -dim.size(), cumprod_dim)); - PADDLE_ENFORCE_LT(cumprod_dim, dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be smaller than the " - "rank of input x which is %d.But received dim=%d", - dim.size(), cumprod_dim)); - if (cumprod_dim < 0) cumprod_dim += dim.size(); - - *outer_dim = 1; - for (int i = 0; i < cumprod_dim; ++i) { - *outer_dim *= dim[i]; - } - *mid_dim = dim[cumprod_dim]; - *inner_dim = 1; - for (int i = cumprod_dim + 1; i < dim.size(); ++i) { - *inner_dim *= dim[i]; - } -} - -template -class CumprodOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - int dim = context.Attr("dim"); - - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - framework::DDim shape = x->dims(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t j = 0; j < mid_dim; j++) { - for (size_t k = 0; k < inner_dim; k++) { - size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; - if (j == 0) { - out_data[pos] = x_data[pos]; - } else { - out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; - } - } - } - } - } -}; - -template -class CumprodGradOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const { - const Tensor* d_out = context.Input(framework::GradVarName("Out")); - const Tensor* x = context.Input("X"); - const Tensor* out = context.Input("Out"); - - int dim = context.Attr("dim"); - framework::DDim shape = x->dims(); - Tensor* d_x = context.Output(framework::GradVarName("X")); - - auto* d_out_data = d_out->data(); - auto* x_data = x->data(); - auto* out_data = out->data(); - auto* d_x_data = d_x->mutable_data(context.GetPlace()); - - auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - size_t numel = outer_dim * mid_dim * inner_dim; - - // deal with complex - const T* x_data_deal; - const T* out_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr out_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto* x_data_conj = reinterpret_cast(x_conj->ptr()); - out_conj = memory::Alloc(place, numel * sizeof(T)); - auto* out_data_conj = reinterpret_cast(out_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_out(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); - for_range_out(functor_out); - - x_data_deal = x_data_conj; - out_data_deal = out_data_conj; - } else { - x_data_deal = x_data; - out_data_deal = out_data; - } - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t k = 0; k < inner_dim; k++) { - for (size_t j = 0; j < mid_dim; j++) { - size_t index = i * mid_dim * inner_dim + j * inner_dim + k; - d_x_data[index] = 0; - for (size_t n = 0; n < mid_dim; n++) { - size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; - T elem; - if (j == 0) { - elem = d_out_data[pos]; - } else { - elem = d_out_data[pos] * out_data_deal[index - inner_dim]; - } - if (pos > index) { - for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { - elem *= x_data_deal[m]; - } - } else if (pos < index) { - elem = static_cast(0); - } - d_x_data[index] += elem; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index b15efc5f84bdd0a62f3ee5deca01b1e601f19aed..6e15fd090b8c4feeb8837efb392a2d3a6a6b80c7 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -338,8 +338,6 @@ REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); -REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel, - ops::DeformableConvCPUKernel); REGISTER_OP_CPU_KERNEL(deformable_conv_grad, ops::DeformableConvGradCPUKernel, ops::DeformableConvGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 2c7d905c79b37e9b1c8777d62f1b593c8a8866a5..ad10abf9c647b588e8c66dea89588e344c46ae69 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -446,108 +446,6 @@ __global__ void FilterGradAddupGpuKernel(const int nthreads, const int n, } } -template -class DeformableConvCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor offset = *ctx.Input("Offset"); - const Tensor mask = *ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.cuda_device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask.numel() / mask.dims()[0]; - - auto blas = phi::funcs::GetBlas(dev_ctx); - - const T* input_ptr = input->data(); - const T* offset_ptr = offset.data(); - const T* mask_ptr = mask.data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2col( - ctx.device_context(), input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCUDAKernel : public framework::OpKernel { public: @@ -740,9 +638,6 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(deformable_conv, - ops::DeformableConvCUDAKernel, - ops::DeformableConvCUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, ops::DeformableConvGradCUDAKernel, ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 66961655ee6ffa88e162477ad424eb10a0702b27..1176b96987ed6fbd0077e68d5bb0d4ece5c4b4f0 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -318,102 +318,6 @@ void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height, } } -template -class DeformableConvCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset->numel() / offset->dims()[0]; - int input_mask_dim = mask->numel() / mask->dims()[0]; - auto blas = phi::funcs::GetBlas(dev_ctx); - const T* input_ptr = input->data(); - const T* offset_ptr = offset->data(); - const T* mask_ptr = mask->data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2colCPU( - dev_ctx, input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - // get the product of pixel and weight - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 0d9fbf612f73c428fb8050fcfcc319ddafabe482..35e389090175f7768244b95b1d388ea0d735c2d5 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,8 +9,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -235,10 +237,13 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, YoloBoxInferShapeFunctor, + PD_INFER_META(phi::YoloBoxInferMeta)); REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + YoloBoxInferShapeFunctor); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 98247fbc862bbc199316a4d4c8971d0f4a159544..6959b5cf811069cc66321d2129a2b69d4e922f09 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,19 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); - -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OP_CPU_KERNEL(determinant, - ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - determinant_grad, ops::DeterminantGradKernel, - ops::DeterminantGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu index d19d4c3d093860c1f603e4d752063b7a858c0460..d8237fa3004e65ce74ece55e4d0e62b5564e5c5f 100644 --- a/paddle/fluid/operators/determinant_op.cu +++ b/paddle/fluid/operators/determinant_op.cu @@ -17,14 +17,6 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - determinant, ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CUDA_KERNEL( - determinant_grad, - ops::DeterminantGradKernel, - ops::DeterminantGradKernel); REGISTER_OP_CUDA_KERNEL( slogdeterminant, ops::SlogDeterminantKernel, diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index f89ecd37222870f73d00870c9454bf5590d504e3..a1fe8a25665ec84b38a535f541a2cbe33d0a7fcf 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,12 +22,15 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -40,232 +43,6 @@ T sign(T val) { return static_cast(T(0) < val) - (val < T(0)); } -template -class EigenMatrix {}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXf; -}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXd; -}; - -inline int64_t GetBatchCount(const framework::DDim dims) { - int64_t batch_count = 1; - auto dim_size = dims.size(); - PADDLE_ENFORCE_GE( - dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - - // Cumulative multiplying each dimension until the last 2 to get the batch - // count, - // for example a tensor with shape [3,3,3,3], the batch count of matrices is - // 9. - for (int64_t i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - - return batch_count; -} - -template -struct DeterminantFunctor { - void operator()(const Tensor& input, const framework::ExecutionContext ctx, - int64_t rank, int64_t batch_count, Tensor* output) { - std::vector input_vec; - std::vector output_vec; - framework::TensorToVector(input, ctx.device_context(), &input_vec); - for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel - auto begin_iter = input_vec.begin() + i * rank * rank; - auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector sub_vec(begin_iter, - end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); - for (int64_t i = 0; i < rank; ++i) { - for (int64_t j = 0; j < rank; ++j) { - matrix(i, j) = sub_vec[rank * i + j]; - } - } - output_vec.push_back(matrix.determinant()); - } - framework::TensorFromVector(output_vec, output); - } -}; -template -class DeterminantKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto input_dim = vectorize(input->dims()); - auto input_dim_size = input_dim.size(); - auto* output = context.Output("Out"); - - auto batch_count = GetBatchCount(input->dims()); - VLOG(2) << "input dim:" << input->dims(); - PADDLE_ENFORCE_GE( - input_dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], - input_dim[input_dim_size - 2], - platform::errors::InvalidArgument( - "the input matrix should be square matrix.")); - auto rank = input_dim[input_dim_size - 1]; // square matrix length - DeterminantFunctor()(*input, context, rank, batch_count, output); - auto output_dims = phi::slice_ddim(input->dims(), 0, input_dim_size - 2); - if (input_dim_size > 2) { - output->Resize(output_dims); - } else { - // when input is a two-dimension matrix, The det value is a number. - output->Resize({1}); - } - VLOG(2) << "output dim:" << output->dims(); - } -}; - -template -struct FoundZeroFunctor { - FoundZeroFunctor(const T* x, int64_t numel, bool* res) - : x_(x), numel_(numel), res_(res) {} - HOSTDEVICE void operator()(size_t idx) const { - if (*res_ || idx >= static_cast(numel_)) { - // founded zero number - return; - } - *res_ = (x_[idx] == static_cast(0)); - } - const T* x_; - int64_t numel_; - bool* res_; -}; - -template -inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, - const framework::Tensor* det) { - auto& dev_ctx = ctx.template device_context(); - auto numel = det->numel(); - - framework::Tensor dev_tensor; - auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); - - // set false - phi::funcs::SetConstant zero; - zero(dev_ctx, &dev_tensor, false); - - // find whether zero - platform::ForRange for_range(dev_ctx, numel); - FoundZeroFunctor functor(det->data(), numel, data); - for_range(functor); - - // copy to host - dev_ctx.Wait(); - framework::Tensor cpu_tensor; - framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); - - // if founded zero, the matrix is not invertible - // else the matrix is invertible - auto* res = cpu_tensor.data(); - return !(*res); -} - -template -class DeterminantGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& orig_dev_ctx = context.template device_context(); - const auto* input = context.Input("Input"); - const auto* det = context.Input("Out"); - const auto* grad = - context.Input(framework::GradVarName("Out")); - auto* ddet = - context.Output(framework::GradVarName("Input")); - - auto input_dims_size = input->dims().size(); - if (input_dims_size > 2) { - PADDLE_ENFORCE_EQ( - grad->dims().size() + 2, input_dims_size, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else if (input_dims_size == 2) { - // input dims size 2 and grad dims size 1 is possible - PADDLE_ENFORCE_EQ( - grad->dims().size(), 1, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else { - // checked in forward, pass - } - - auto& dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE&>( - orig_dev_ctx); - - // Check Whether the matrix is invertible - // (matrix A not invertible) == (det(A)=0) - if (!CheckMatrixInvertible(context, det)) { - // The matrix is not invertible - VLOG(3) << "The input matrix not invertible!"; - ddet->Resize(input->dims()); - phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), - ddet); - return; - } - - // The matrix is invertible - // let |A| = Determinant(A) - // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf - // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, - // -1) - - // First: inverse(A) - framework::Tensor inverse_A; - // A must be square matrices! - inverse_A.Resize(input->dims()); - inverse_A.mutable_data(context.GetPlace()); - - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(orig_dev_ctx, *input, &inverse_A); - - VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); - - // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = - phi::TransposeLast2Dim(dev_ctx, inverse_A); - - VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " - << transpose_inverse_A.dims(); - - // Third: dA * |A| - auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); - VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); - - // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); - VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); - - // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); - - VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); - - framework::TensorCopy(res, context.GetPlace(), ddet); - - ddet->Resize(input->dims()); - VLOG(3) << "d|A| dims: " << ddet->dims(); - } -}; - template struct SlogDeterminantFunctor { void operator()(const Tensor& input, const framework::ExecutionContext ctx, @@ -280,7 +57,7 @@ struct SlogDeterminantFunctor { auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); + typename phi::detail::EigenMatrix::MatrixType matrix(rank, rank); for (int64_t i = 0; i < rank; ++i) { for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; @@ -311,7 +88,7 @@ class SlogDeterminantKernel : public framework::OpKernel { auto input_dim_size = input_dim.size(); auto* output = context.Output("Out"); - auto batch_count = GetBatchCount(input->dims()); + auto batch_count = phi::detail::GetBatchCount(input->dims()); VLOG(2) << "input dim:" << input->dims(); PADDLE_ENFORCE_GE( input_dim_size, 2, @@ -370,7 +147,9 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); auto absslogdet_val = slogdet_vec[0]; - if (!CheckMatrixInvertible(context, &absslogdet_val)) { + if (!phi::detail::CheckMatrixInvertible< + T, typename framework::ConvertToPhiContext::TYPE>( + dev_ctx, &absslogdet_val)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46..3d9950902acfe80a3cfef6c9efa2c6370e685c32 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,17 +27,6 @@ class DropoutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dropout"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - if (ctx->Attrs().Get("is_test") == false) { - ctx->SetOutputDim("Mask", x_dims); - } - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,7 +164,11 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor, + PD_INFER_META(phi::DropoutInferMeta)); + REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, - ops::DropoutGradOpMaker); + ops::DropoutGradOpMaker, + DropoutInferShapeFunctor); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 5e4c83e1a45ebdb96a0e764cfa2d3997442ae1ea..6daf05a9d778dfb194225f59321ffc3eb40235db 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -21,13 +21,13 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a995877778e4770ea8ae64c051a71b31c1fb1e29..c28abb916b7a7d59d5a1974bed63e43b2f32ef2c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -27,7 +27,7 @@ limitations under the License. */ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 58a3123c7e332f50b0830577436528f1e8df1cdf..6f4aba93d56e2a8227a8578067ac934d41243fb6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e1625912dad127b672228f9cc64eb7cec3..f9347d281043ecc63acdb8ca2fb0a18dae4adc47 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f5902e816a468d43ccfa009fe35a045fc..c68aa8d3d1b46c9013c6fe6a12510f0cdb744682 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_out_memory}, - {DNNL_ARG_DST, *dst_dy_memory}, - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// TODO(piotrekobi) add int8, uint8 support -REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseDivMKLDNNGradKernel, - ops::EltwiseDivMKLDNNGradKernel) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ad8fd317013908e8908dff8bea3440e24779454e..d1a1aa3008c8b33690ecd9ea85501ad0178f592a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,20 +15,35 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { @@ -103,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -135,19 +150,187 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } else { // elementwise_mul & elementwise_div + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } else { // elementwise_mul & elementwise_div + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff3b793684222c62f423edd6e8637f1..0ef5c5e628ce62084305fc95e66862a15822ecb3 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -1,127 +1,19 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2abcf3fc59da7b759c9d43f3e940e8e..510373831eb6db5c7ffed6e8e58cbfb0ae268a50 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 97a35a34f23e96707269482e29da13a15538cdca..9361edd43bf15ac0eee4a4de618027af79b78b56 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,7 +12,9 @@ limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,27 +24,6 @@ using framework::Tensor; class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2"); - auto x_dims = ctx->GetInputDim("X"); - auto target_shape = ctx->Attrs().Get>("target_shape"); - PADDLE_ENFORCE_GE( - target_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The rank of target_shape must be greater than or equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u; received target_shape: rank %u.", - x_dims.size(), target_shape.size())); - PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of target_shape must be less than or equal " - "to %d. But received: rank %u.", - MAX_RANK_SUPPORTED, target_shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(target_shape)); - } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +97,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor, + PD_INFER_META(phi::ExpandAsInferMeta)); REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker, - ops::ExpandAsV2GradOpMaker); + ops::ExpandAsV2GradOpMaker, + ExpandAsInferShapeFunctor); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 9f7e4fb8d5749cf6bd54ed3e3bf9699199c0d3e6..70597be393c35e6939b83d86ce2f9be8f2c36805 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { extern __shared__ char* shared_max_data_tmp[]; auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T tmp = abs(in[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; } else { if (bid < n) { shared_max_data[tid] = abs(in[bid]); @@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; extern __shared__ T shared_max_data[]; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = fabs(in_c[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { @@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, int tid = threadIdx.x; int bid = blockIdx.x; const T* in_current = in + tid * cout_wh_size + bid * wh_size; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = fabs(in_current[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); int len = blockDim.x; @@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor { } }; +template +__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, + const T* in_accum, + const T* cur_scale, const T rate, + T* out_state, T* out_accum, + T* out_scale) { + T state = rate * (*in_state) + T(1.0f); + T accum = rate * (*in_accum) + (*cur_scale); + *out_state = state; + *out_accum = accum; + *out_scale = accum / state; +} + template struct FindRangeAbsMaxFunctor; template @@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor { framework::Tensor* out_accum, framework::Tensor* out_scale) { const auto gpu_place = ctx.GetPlace(); - T accum; - T state; - T scale; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - ctx.stream()); - ctx.Wait(); - T rate_t = static_cast(rate); - state = rate_t * state + static_cast(1.0); - accum = rate_t * accum + scale; - scale = accum / state; - - memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); - ctx.Wait(); + T* out_state_data = out_state->mutable_data(gpu_place); + T* out_accum_data = out_accum->mutable_data(gpu_place); + T* out_scale_data = out_scale->mutable_data(gpu_place); + + FindMovingAverageAbsMaxKernel<<<1, 1, 0, ctx.stream()>>>( + in_state.data(), in_accum.data(), cur_scale, rate_t, + out_state_data, out_accum_data, out_scale_data); } }; diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index c445a28c084f67f2688e17994cb622903b73c707..e60fc44e9a6ffc106a9c6957c2365e7b44c467b9 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -120,6 +120,142 @@ class Conv2DFusionOp : public operators::ConvOp { ctx->SetOutputsDim("Outputs", output_shapes); } } + + std::vector ComputeOutputShape( + framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv"); + OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv"); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::string padding_algorithm = + ctx->Attrs().Get("padding_algorithm"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], 0, + platform::errors::InvalidArgument( + "The dilation of Op(Conv) should be larget than 0, but received " + "dilation is %d.", + dilations[i])); + } + const std::string data_format = + ctx->Attrs().Get("data_format"); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, true, + platform::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), in_dims)); + + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + platform::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. But received: the input's shape is " + "[%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, in_dims.size(), filter_dims, filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( + in_dims.size(), strides.size() + 2U, + platform::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be euqal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimention and Attr(strides)'s length = %u.", + in_dims.size(), in_dims, strides.size(), phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? in_dims[in_dims.size() - 1] : in_dims[1]; + + PADDLE_ENFORCE_EQ( + input_channels, filter_dims[1] * groups, + platform::errors::InvalidArgument( + "The number of input's channels should be equal to filter's " + "channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. " + "The error may come from wrong data_format setting.", + input_channels, in_dims, filter_dims[1], filter_dims, groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + platform::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], filter_dims, groups)); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GT( + filter_dims[0], 0, + platform::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); + } + + framework::DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!ctx->IsRuntime()) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + output_shape.push_back( + ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i], + paddings[2 * i], paddings[2 * i + 1], strides[i])); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + return output_shape; + } }; // TODO(qingqing): add gradient operator for conv2d_fusion diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 18c7187fc8e64c9fed8a86a984954b5420c1e5b5..a9b72a9cdf397f026f6ce24d83cc13066a3fd000 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -25,14 +25,16 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; USE_OP_ITSELF(dropout); -USE_OP(layer_norm); +USE_OP_ITSELF(layer_norm); template using CudnnDataType = platform::CudnnDataType; @@ -136,18 +138,23 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); + paddle::optional scale_opt = paddle::none; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); framework::TensorFromVector(scale, ctx, tensor_scale); tensor_scale->Resize({cols}); + scale_opt = *tensor_scale; } + paddle::optional bias_opt = paddle::none; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); framework::TensorFromVector(bias, ctx, tensor_bias); tensor_bias->Resize({cols}); + + bias_opt = *tensor_bias; } auto var_x = scope.Var("X"); @@ -157,20 +164,19 @@ void LayerNorm(const std::vector> &scale, auto var_y = scope.Var("Y"); auto tensor_y = var_y->GetMutable(); + tensor_y->Resize({rows, cols}); auto var_mean = scope.Var("Mean"); auto tensor_mean = var_mean->GetMutable(); + tensor_mean->Resize({rows}); auto var_variance = scope.Var("Variance"); auto tensor_variance = var_variance->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"epsilon", epsilon}); - - auto op = framework::OpRegistry::CreateOp( - "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, - {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs); - op->Run(scope, place); + tensor_variance->Resize({rows}); + ctx.Wait(); + phi::LayerNormKernel(static_cast(ctx), *tensor_x, + scale_opt, bias_opt, 1e-5, 1, false, tensor_y, + tensor_mean, tensor_variance); framework::TensorToVector(*tensor_y, ctx, y); framework::TensorToVector(*tensor_mean, ctx, means); framework::TensorToVector(*tensor_variance, ctx, vars); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 032440d7f0478dc087e3ba38274f2a31a9a66a23..c7e1f4a5463fe11b9fa96f147b71004140130399 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -198,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias { residual_vec[i * cols + j] + out2[i * cols + j]; } } - LayerNorm(scale_vec, layernorm_bias_vec, correct_out, &correct_means, &correct_vars, &correct_layernorm_out, epsilon, rows, cols, *ctx); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 8a405cc6fc1baefe997fb5b6133a56d6a2fc0438..9f2b48a24b44700dc93e9eba09ea2dd2a900bdfa 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -12,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherOp should not be null.")); - - auto index_dims = ctx->GetInputDim("Index"); - - if (index_dims.size() == 2) { - PADDLE_ENFORCE_EQ( - index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of index should be 1 when it is 2D, but we get %d", - index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - index_dims.size(), 1, - platform::errors::InvalidArgument( - "The index should be 1D, when it is not 2D, but we get %d", - index_dims.size())); - } - - auto axis = ctx->Attrs().Get("axis"); - auto input_dim = ctx->GetInputDim("X"); - if (ctx->HasInput("Axis") || axis == 0) { - // if HasInput("Axis"), we can not obtain correct shape of output - int batch_size = index_dims[0]; - framework::DDim output_dims(input_dim); - output_dims[0] = batch_size; - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - int index_size = index_dims[0]; - std::vector out_dim_vec; - for (int i = 0; i < axis; i++) { - out_dim_vec.push_back(input_dim[i]); - } - out_dim_vec.push_back(index_size); - for (int i = axis + 1; i < input_dim.size(); i++) { - out_dim_vec.push_back(input_dim[i]); - } - auto output_dims = phi::make_ddim(out_dim_vec); - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +141,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor, + PD_INFER_META(phi::GatherInferMeta)); REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOpMaker, - ops::GatherGradOpMaker); + ops::GatherGradOpMaker, + GatherInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, - ops::GatherGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradNoNeedBufferVarInferer, + GatherGradInferShapeFunctor); + REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu deleted file mode 100644 index e0db2f26d3e0534f924cc709b98689fb3f1a5cc6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op.cu +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - - // get axis from tensor - if (ctx.HasInput("Axis")) { - Tensor cpu_axis; - const Tensor *axis_tensor = ctx.Input("Axis"); - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT16) { - axis = static_cast(cpu_axis.data()[0]); - } - } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - const auto &dev_ctx = ctx.cuda_device_context(); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - Tensor cpu_axis; - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } - } - - const auto &dev_ctx = ctx.cuda_device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h deleted file mode 100644 index 94de694b2f9bc484cdb60298b60d5a9433dac181..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_op.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - // get axis from tensor - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *dev_ctx.eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - bool overwrite = ctx.Attr("overwrite"); - - if (index_type == phi::DataType::INT32) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } else if (index_type == phi::DataType::INT64) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 21093f585b59eea24a231b4dcdf264dc16178fbd..f996b1ede2f0fdbf7739d579380d71e9dc3448e7 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 3dce380360815c292153ef2bfb1a447357c90acb..b42050eabe300bea59c95c50c356d9e115c0dddf 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -24,16 +24,15 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gather); +USE_OP_ITSELF(gather); USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP(gather_grad); +USE_OP_ITSELF(gather_grad); USE_OP_DEVICE_KERNEL(gather_grad, NPU); template diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 28f2f7d473bef308f581266bdb1925864aca4b78..6c691aa14ae77acc3c4ebc2077ea9182e4354d54 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class GatherOpXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3d338f00d4fcbf4be35b2392a10c275526dc5d4b..3be2606bfc93984f918adf595b522fe6bfca72be 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,10 +14,11 @@ limitations under the License. */ #include #include -#include - -#include "paddle/fluid/operators/gelu_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of GeluOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of GeluOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker, ops::GeluGradOpMaker, - ops::GeluGradOpMaker); + ops::GeluGradOpMaker, + GeluInferShapeFunctor); REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp); -REGISTER_OP_CPU_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CPU_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu deleted file mode 100644 index ef836ab72f001a540e081d7e9975ca5ee28758be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op.cu +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gelu_op.h" - -DECLARE_bool(use_fast_math); - -namespace paddle { -namespace operators { - -#ifdef __NVCC__ -template -static __device__ __forceinline__ float FP32FastTanh(float x) { -#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 - if (FastMode) { - float y; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); - return y; - } -#endif - return tanhf(x); -} - -template -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); -} - -template -static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * - (0.79788456f + 0.1070322243f * x * x)) + - 0.5f * (1.0f + tanh_out); - return tmp * y_g; -} - -template -static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT in_arr = *reinterpret_cast(x + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); - } - *reinterpret_cast(y + offset) = in_arr; - } -} - -template -static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, - const __half* y_g, __half* x_g, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT x_in_arr = *reinterpret_cast(x + offset); - ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - __half2 tmp_fp16_2; - tmp_fp16_2.x = x_in_arr[i]; - tmp_fp16_2.y = y_g_in_arr[i]; - float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); - x_in_arr[i] = - __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); - } - *reinterpret_cast(x_g + offset) = x_in_arr; - } -} - -static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y, - size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(y, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluFwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL - return false; -} - -static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, - const __half* y_g, __half* x_g, size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ - is_aligned(x_g, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluBwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y_g, \ - x_g, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL - return false; -} -#endif - -template -struct GeluWithApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // this function is tanh approximation of gelu - MPType x = static_cast(arg_x); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - auto tanh_out = - tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); - MPType out = x * half * (one + tanh_out); - return static_cast(out); - } -}; - -template -struct GeluWithoutApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // actual gelu with approximation = false - MPType x = static_cast(arg_x); - return static_cast(x * normcdf(x)); - } -}; - -template -class GeluKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - std::vector ins = {in}; - std::vector outs = {out}; - const auto& dev_ctx = - context.template device_context(); - - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = in->numel(); - const auto* in_ptr = reinterpret_cast(in->data()); - auto* out_ptr = reinterpret_cast<__half*>(out->data()); - if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr, - out_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); - } - } -}; - -template -struct GeluWithApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - MPType kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - auto cube_x = x * x * x; - auto tanh_out = - tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); - auto ans = - half * (one + tanh_out + - (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); - return static_cast(ans * dout); - } -}; - -template -struct GeluWithoutApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); - const MPType cdf = normcdf(x); - const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; - return static_cast(dout * (cdf + x * pdf)); - } -}; - -template -class GeluGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - std::vector ins = {x, dout}; - std::vector outs = {dx}; - const auto& dev_ctx = - context.template device_context(); - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = x->numel(); - const auto* x_ptr = reinterpret_cast(x->data()); - const auto* y_g_ptr = reinterpret_cast(dout->data()); - auto* x_g_ptr = reinterpret_cast<__half*>(dx->data()); - if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr, - x_g_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CUDA_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h deleted file mode 100644 index d4fed8a868ff9e66f64c90ab9352e824ab673217..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - -namespace paddle { -namespace operators { - -#define GELU_CONSTANT 0.044715 - -template -struct GeluFunctor { - template - void operator()(Device d, X x, Out out, bool approximate) const { - if (approximate) { - // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = - (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) - .tanh(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (x + static_cast(GELU_CONSTANT) * x.cube())) - .tanh(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto out_data = out.data(); - int n = std::min(x.size(), out.size()); - - std::memset(out_data, 0, n * sizeof(T)); - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, - out_data, 1); - phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); - for (int i = 0; i < n; i++) { - out_data[i] += static_cast(1); - } - phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); - for (int i = 0; i < n; i++) { - out_data[i] *= static_cast(0.5); - } -#else - // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } -#endif - } - } -}; - -template -struct GeluGradFunctor { - template - void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { - if (approximate) { - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - - const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const float kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * - ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) - .tanh(); - dx.device(d) = (static_cast(0.5) * casted_dout * - (static_cast(1) + y + - (casted_x - casted_x * y.square()) * - (kAlpha + kBeta * casted_x.square()))) - .template cast(); - } else { - const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const T kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); - dx.device(d) = static_cast(0.5) * dout * - (static_cast(1) + y + - (x - x * y.square()) * (kAlpha + kBeta * x.square())); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto dx_data = dx.data(); - auto dout_data = dout.data(); - int n = std::min(x.size(), dx.size()); - - auto first = static_cast(std::malloc(n * sizeof(T))); - std::memset(first, 0, n * sizeof(T)); - auto second = static_cast(std::malloc(n * sizeof(T))); - std::memset(second, 0, n * sizeof(T)); - - // first = (0.5 * (1 + erf(x / sqrt(2)))) - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, first, - 1); - phi::funcs::CBlas::VMERF(n, first, first, VML_LA); - for (int i = 0; i < n; i++) { - first[i] += static_cast(1); - } - phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); - - // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) - phi::funcs::CBlas::VSQUARE(n, x_data, second); - phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); - phi::funcs::CBlas::VEXP(n, second, second); - phi::funcs::CBlas::VMUL(n, x_data, second, second); - phi::funcs::CBlas::SCAL( - n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); - - // dx = dout * (first + second); - phi::funcs::CBlas::VADD(n, first, second, first); - phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); - - std::free(first); - std::free(second); -#else - // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * - // exp(- x^2 / 2) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - auto first = static_cast(0.5) * - (static_cast(1) + - ((casted_x * static_cast(M_SQRT1_2)).erf())); - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * - casted_x * - (-static_cast(0.5) * casted_x.square()).exp(); - dx.device(d) = (casted_dout * (first + second)).template cast(); - } else { - auto first = - static_cast(0.5) * - (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); - - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * - (-static_cast(0.5) * x.square()).exp(); - dx.device(d) = dout * (first + second); - } -#endif - } - } -}; - -template -class GeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - - GeluFunctor functor; - functor(place, eigen_in, eigen_out, approximate); - } -}; - -template -class GeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - - GeluGradFunctor functor; - functor(place, eigen_x, eigen_dout, eigen_dx, approximate); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index 18bbc7f4929c6493db9161d0415c0728eb8689c0..c5297dd9cd404b7637c2eec79dafcc027509ddcb 100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/gelu_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index f3ac53138328dbfad12c6d530a6517f40c658677..b132b3170756d95adfde51e6d6ce7a5f0f25ca26 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -30,7 +30,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, NPU); template diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc index b8c2e9becf2950d12f87ec5d61c05f3bf0010b12..559d2448ad94525d623e24fc8fb6c5e3881b58e3 100644 --- a/paddle/fluid/operators/gelu_op_xpu.cc +++ b/paddle/fluid/operators/gelu_op_xpu.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include #include - -#include "paddle/fluid/operators/gelu_op.h" - +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 04aa6a3e10f6e3f55f9845d1b4b6bd6aa762c016..f6d3fd898469113dcffce76a84e4c292603707c6 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/grid_sampler_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -27,43 +31,6 @@ using Tensor = framework::Tensor; class GridSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler"); - OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler"); - - auto x_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(X) of GridSampleOp should be 4-D Tensor, but " - "received X dimension size(%d)", - x_dims.size())); - PADDLE_ENFORCE_EQ(grid_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(Grid) of GridSampleOp should be 4-D Tensor, " - "but received X dimension size(%d)", - grid_dims.size())); - if (ctx->IsRuntime() || grid_dims[3] > 0) { - PADDLE_ENFORCE_EQ( - grid_dims[3], 2, - platform::errors::InvalidArgument( - "Input(Grid) dimension[3] should be 2, but received %d", - grid_dims[3])); - } - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - grid_dims[0], x_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Grid) dimension[0] should be equal, but " - "received X dimension[0](%d) != Grid dimension[0](%d)", - x_dims[0], grid_dims[0])); - } - - ctx->SetOutputDim("Output", - {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); - ctx->ShareLoD("X", "Output"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { class GridSampleOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "grid_sampler"); - auto input_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), input_dims); - } - if (ctx->HasOutput(framework::GradVarName("Grid"))) { - ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -224,19 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor, + PD_INFER_META(phi::GridSampleBaseInferMeta)); REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker, ops::GridSampleGradMaker, - ops::GridSampleGradMaker); -REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad); - -REGISTER_OP_CPU_KERNEL( - grid_sampler, - ops::GridSampleOpKernel, - ops::GridSampleOpKernel); -REGISTER_OP_CPU_KERNEL( - grid_sampler_grad, - ops::GridSampleGradOpKernel, - ops::GridSampleGradOpKernel); + ops::GridSampleGradMaker, + GridSamplerInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); +REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad, + GridSamplerGradInferShapeFunctor); REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu deleted file mode 100644 index a227a8e312765b4311314ea884f2c32443924fbc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ /dev/null @@ -1,492 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -template -static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, - int sW, int H, int W, - T delta) { - if (in_bounds(h, w, H, W)) { - platform::CudaAtomicAdd(data + h * sH + w * sW, delta); - } -} - -template -static __forceinline__ __device__ T _unnormalize(T coord, int size, - bool align_corners) { - if (align_corners) { - return ((coord + 1.f) / 2) * (size - 1); - } else { - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes(T in, int max_value) { - return min(static_cast(max_value), max(in, static_cast(0))); -} - -template -static __forceinline__ __device__ T reflect_indexes(T in, int twice_low, - int twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = fabs(in - min); - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T compute_positions(T coord, int size, - PaddingMode padding_mode, - bool align_corners) { - coord = _unnormalize(coord, size, align_corners); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes(coord, size - 1); - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_indexes(coord, -1, 2 * size - 1); - } - coord = clip_indexes(coord, size - 1); - } - return coord; -} - -template -static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size, - bool align_corners, - T* grad_in) { - if (align_corners) { - *grad_in = static_cast(size - 1) / 2; - return ((coord + 1.f) / 2) * (size - 1); - } else { - *grad_in = static_cast(size) / 2; - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit, - T* grad_in) { - if (in <= static_cast(0)) { - *grad_in = static_cast(0); - return static_cast(0); - } else { - T max = static_cast(clip_limit - 1); - if (in >= max) { - *grad_in = static_cast(0); - return max; - } else { - *grad_in = static_cast(1); - return in; - } - } -} - -template -static __forceinline__ __device__ T -reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast(0); - return static_cast(0); - } - int grad_in_mult_; - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast(-grad_in_mult_); - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T -compute_positions_with_mask(T coord, int size, PaddingMode padding_mode, - bool align_corners, T* grad_in) { - T grad_clip, grad_refl; - coord = _unnormalize_with_mask(coord, size, align_corners, grad_in); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl); - } else { - coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl); - } - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - - return coord; -} - -template -__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, - int out_h, int out_w, int in_h, - int in_w, const T* input, const T* grid, - T* output, const Mode mode, - const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - int out_sN = out_c * out_h * out_w; - int out_sC = out_h * out_w; - int out_sH = out_w; - int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - ix = compute_positions(ix, in_w, padding_mode, align_corners); - iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - auto inp_offset_NC = n * inp_sN; - - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - auto inp_offset_NC = n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) { - *out_ptr_NCHW = - input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } -} - -template -class GridSampleOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h - << "; out_w: " << out_w; - auto* output = ctx.Output("Output"); - auto* output_data = output->mutable_data(ctx.GetPlace()); - VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] - << "; " << output->dims()[2] << "; " << output->dims()[3]; - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sample_cuda_kernel< - T><<>>( - count, n, c, out_h, out_w, in_h, in_w, input->data(), - grid->data(), output_data, mode, padding_mode, align_corners); - } -}; - -template -__global__ void grid_sampler_cuda_backward_kernel( - const int nthreads, const T* grad_output, const T* input, const T* grid, - int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input, - T* grad_grid, const Mode mode, const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - - int gOut_sN = out_c * out_h * out_w; - int gOut_sC = out_h * out_w; - int gOut_sH = out_w; - int gOut_sW = 1; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - T gix_mult, giy_mult; - ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners, - &gix_mult); - iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners, - &giy_mult); - - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - T gix = static_cast(0), giy = static_cast(0); - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - int inp_offset_NC = n * inp_sN; - for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, - gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - T gOut = grad_output[gOut_offset]; - - atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, - nw * gOut); - atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, - ne * gOut); - atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, - sw * gOut); - atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, - se * gOut); - - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - for (int c = 0; c < out_c; - ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h, - in_w, grad_output[gOut_offset]); - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = static_cast(0); - gGrid_ptr_NHW[1] = static_cast(0); - } - } - } -} - -template -class GridSampleGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - input_grad, static_cast(0)); - - T* grid_grad_data = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - } - - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sampler_cuda_backward_kernel< - T><<>>( - count, output_grad->data(), input->data(), grid->data(), n, c, - out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, - padding_mode, align_corners); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel, - ops::GridSampleOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(grid_sampler_grad, - ops::GridSampleGradOpCUDAKernel, - ops::GridSampleGradOpCUDAKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h deleted file mode 100644 index 93e96694270a458844bbcabf78f2559975902c2f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/grid_sampler_op.h +++ /dev/null @@ -1,600 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -enum class Mode { - bilinear, - nearest, -}; - -enum class PaddingMode { zeros, border, reflect }; - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array3 = Eigen::DSizes; -using Array4 = Eigen::DSizes; - -template -static inline bool isInBound(T x, T y, T x_max, T y_max) { - if (x < 0 || x > x_max || y < 0 || y > y_max) { - return false; - } - return true; -} - -template -static inline void unnormalize(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - - if (!align_corners) { - auto factor = static_cast((max_val + 1) * 0.5); - grid_slice_t.device(place) = - (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); - } else { - auto factor = static_cast(max_val * 0.5); - grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; - } -} - -template -static inline void clip(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - if (padding_mode == "border") { - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } - } -} - -template -static inline void clipWithMask(const platform::CPUDeviceContext& ctx, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode, - Tensor* grid_slice, Tensor* grid_scale) { - auto& place = *ctx.eigen_device(); - grid_scale->mutable_data(grid_slice->dims(), ctx.GetPlace()); - - auto grid_slice_t = EigenTensor::From(*grid_slice); - auto factor = static_cast(max_val * 0.5); - if (!align_corners) { - factor = static_cast((max_val + 1) * 0.5); - } - auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); - - if (padding_mode == "border") { - // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); - auto res = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - - auto in_bound = (res == grid_slice_t); - grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); - grid_slice_t.device(place) = res; - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto is_neg = (grid_slice_t < static_cast(0)); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()); - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - auto reflected = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - auto clipped = reflected.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - auto in_bound = (clipped == reflected).template cast(); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * - in_bound; - grid_slice_t.device(place) = clipped; - } - } -} - -template -static void calcGridLocations(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); - clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); -} - -template -static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y, Tensor* grid_x_scale, - Tensor* grid_y_scale) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clipWithMask(ctx, in_w - 1, align_corners, padding_mode, grid_x, - grid_x_scale); - clipWithMask(ctx, in_h - 1, align_corners, padding_mode, grid_y, - grid_y_scale); -} - -template -static void getGridPointValue(const Tensor& input, Tensor* output, - const Tensor& x, const Tensor& y) { - const int n = input.dims()[0]; - const int c = input.dims()[1]; - const int in_h = input.dims()[2]; - const int in_w = input.dims()[3]; - const int out_h = x.dims()[1]; - const int out_w = x.dims()[2]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto output_t = EigenTensor::From(*output).setConstant((T)0); - auto input_t = EigenTensor::From(input); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - output_t(i, j, k, l) = - input_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))); - } - } - } - } - } -} - -template -static void allNeigbors(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* x_w, Tensor* x_e, Tensor* y_n, - Tensor* y_s, // positions - Tensor* d_w, Tensor* d_e, Tensor* d_n, - Tensor* d_s, // distance - Tensor* v_wn, Tensor* v_en, Tensor* v_ws, - Tensor* v_es) { // values - auto& place = *ctx.eigen_device(); - - const int c = input.dims()[1]; - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - // calculate coords of 4 corner points - x_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - x_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto x_w_t = EigenTensor::From(*x_w); - auto x_e_t = EigenTensor::From(*x_e); - auto y_n_t = EigenTensor::From(*y_n); - auto y_s_t = EigenTensor::From(*y_s); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - - x_w_t.device(place) = grid_x_t.floor(); - x_e_t.device(place) = x_w_t + static_cast(1); - y_n_t.device(place) = grid_y_t.floor(); - y_s_t.device(place) = y_n_t + static_cast(1); - - // calculate distances to 4 sides - d_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto d_w_t = EigenTensor::From(*d_w); - auto d_e_t = EigenTensor::From(*d_e); - auto d_n_t = EigenTensor::From(*d_n); - auto d_s_t = EigenTensor::From(*d_s); - d_w_t.device(place) = grid_x_t - x_w_t; - d_e_t.device(place) = x_e_t - grid_x_t; - d_n_t.device(place) = grid_y_t - y_n_t; - d_s_t.device(place) = y_s_t - grid_y_t; - - // calc 4 corner points value - v_wn->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_en->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_ws->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_es->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - getGridPointValue(input, v_wn, *x_w, *y_n); - getGridPointValue(input, v_en, *x_e, *y_n); - getGridPointValue(input, v_ws, *x_w, *y_s); - getGridPointValue(input, v_es, *x_e, *y_s); -} - -template -static void bilinearInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, - &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto d_w_scaled_t = - d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_e_scaled_t = - d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_n_scaled_t = - d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_s_scaled_t = - d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - auto output_t = EigenTensor::From(*out); - // bilinear interpolaetion by 4 corner points - output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + - v_en_t * d_w_scaled_t * d_s_scaled_t + - v_ws_t * d_e_scaled_t * d_n_scaled_t + - v_es_t * d_w_scaled_t * d_n_scaled_t; -} - -template -static void nearestInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(input, out, *grid_x, *grid_y); -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y, const Tensor& d1, - const Tensor& d2) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto d1_t = EigenTensor::From(d1); - auto d2_t = EigenTensor::From(d2); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); - } - } - } - } - } -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l); - } - } - } - } - } -} - -template -static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx, - const Tensor& input, const Tensor& output_grad, - Tensor* grid_x, Tensor* grid_y, - Tensor* grid_x_scale, Tensor* grid_y_scale, - Tensor* input_grad, Tensor* grid_grad) { - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, - grid_x, // grid_x - grid_y, // grid_y - &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en, - &v_ws, &v_es); - - // gather output grad value to input grad by corner point coords and weight - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); - - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto output_grad_t = EigenTensor::From(output_grad); - - if (grid_grad != nullptr) { - Tensor grid_grad_x, grid_grad_y; - grid_grad_x.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - grid_grad_y.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto grid_grad_x_t = - EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); - auto grid_grad_y_t = - EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - grid_grad_x_t(i, k, l) += - ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + - (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * - output_grad_t(i, j, k, l); - grid_grad_y_t(i, k, l) += - ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + - (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * - output_grad_t(i, j, k, l); - } - } - } - } - - // const T x_max = static_cast(in_w - 1); - // const T y_max = static_cast(in_h - 1); - - auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); - auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); - grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; - grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; - - // gather grid_grad [x, y] in 3rd Dim - T* grid_grad_data = grid_grad->data(); - T* grid_grad_x_data = grid_grad_x.data(); - T* grid_grad_y_data = grid_grad_y.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_grad_data[2 * i] = grid_grad_x_data[i]; - grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; - } - } -} - -template -class GridSampleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* output = ctx.Output("Output"); - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), output, - static_cast(0)); - - Tensor grid_x, grid_y; - calcGridLocations( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y); - if (mode == "bilinear") { - bilinearInter( - ctx.template device_context(), *input, - &grid_x, &grid_y, output); - } else if (mode == "nearest") { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(*input, output, grid_x, grid_y); - } - } -}; - -template -class GridSampleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), input_grad, - static_cast(0)); - - Tensor* grid_grad = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), grid_grad, - static_cast(0)); - } - - Tensor grid_x, grid_y; - Tensor grid_x_scale, grid_y_scale; - calcGridLocationsWithGrad( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale, - &grid_y_scale); - if (mode == "bilinear") { - gatherBilinearGrad(ctx.template device_context(), - *input, *output_grad, &grid_x, &grid_y, - &grid_x_scale, &grid_y_scale, input_grad, - grid_grad); - } else { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - gatherOutputGradToInputGrad(*output_grad, input_grad, grid_x, grid_y); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 9575ab54b32bda9292e5d266010484a34eae3e54..93f0d3d334f271ec7e40e38e9d654ad7f8ba3c59 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -60,31 +64,6 @@ namespace operators { class HierarchicalSigmoidOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("PreOut"), "Output", "PreOut", "hsigmoid"); - - auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); - if (with_prefetch) { - OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid"); - } - const int64_t input_dims = ctx->GetInputDim("X")[0]; - const int64_t label_dims = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(input_dims, label_dims, - platform::errors::InvalidArgument( - "The first dimension of " - "input and label is expected to be the same. " - "But received input's first dimension is %d; " - "label's first dimension is %d.", - input_dims, label_dims)); - - std::vector output_shape({input_dims, 1}); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", /*->*/ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -272,22 +251,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR( - hierarchical_sigmoid, ops::HierarchicalSigmoidOp, - ops::HierarchicalSigmoidOpMaker, - ops::HierarchicalSigmoidGradMaker, - ops::HierarchicalSigmoidGradMaker); +DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid, + HierarchicalSigmoidInferShapeFunctor, + PD_INFER_META(phi::HierarchicalSigmoidInferMeta)); +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + ops::HierarchicalSigmoidGradMaker, + ops::HierarchicalSigmoidGradMaker, + HierarchicalSigmoidInferShapeFunctor); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, ops::HierarchicalSigmoidGradOpGradVarTypeInference, ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid, - ops::HierarchicalSigmoidOpKernel, - ops::HierarchicalSigmoidOpKernel); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid_grad, - ops::HierarchicalSigmoidGradOpKernel, - ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h deleted file mode 100644 index f11b28cfefb071182eb99cce3d8c2b7f2343cdf6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/clip_op.h" -#include "paddle/fluid/operators/math/matrix_bit_code.h" -#include "paddle/fluid/platform/transform.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; -using platform::Transform; -using framework::LoDTensor; - -static std::vector PathToRows(const LoDTensor& path) { - std::set rows; - const int64_t* paths = path.data(); - for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = paths[i]; - if (row < 0) { - continue; - } - rows.emplace(row); - } - return std::vector(rows.begin(), rows.end()); -} -template -class HierarchicalSigmoidOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoid"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoid"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoid"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - auto* pre_out = ctx.Output("PreOut"); - size_t num_classes = static_cast(ctx.Attr("num_classes")); - // for remote prefetch - - bool is_custom = false; - if (path) { - is_custom = true; - } - int64_t code_length = - path ? path->dims()[1] : math::FindLastSet(num_classes - 1); - int64_t batch_size = in.dims()[0]; - LoDTensor sum; - auto& dev_ctx = ctx.template device_context(); - auto* pre_out_data = pre_out->mutable_data( - phi::make_ddim({batch_size, code_length}), ctx.GetPlace()); - auto pre_out_mat = EigenMatrix::From(*pre_out); - // Not all class(leaf) nodes' path lengths equal code_length, thus init as - // 0s can avoid out of path's loss. - phi::funcs::SetConstant zero; - zero(dev_ctx, pre_out, static_cast(0.0)); - auto& place = *ctx.template device_context().eigen_device(); - phi::funcs::RowwiseSum row_sum; - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - std::vector sum_dims({batch_size, 1UL}); - sum.mutable_data(phi::make_ddim(sum_dims), ctx.GetPlace()); - auto sum_mat = EigenMatrix::From(sum); - out->mutable_data(ctx.GetPlace()); - auto out_mat = framework::EigenMatrix::From(*out); - if (bias) { - bit_code->Add(*bias, pre_out); - } - bit_code->Mul(pre_out, w, in); - // clip to [-40, 40] - Transform trans; - trans(ctx.template device_context(), pre_out_data, - pre_out_data + pre_out->numel(), pre_out_data, - ClipFunctor(static_cast(-40.0), static_cast(40.0))); - bit_code->Sum(*pre_out, out, static_cast(-1)); - // use softrelu to calculate cross entropy - pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); - row_sum(dev_ctx, *pre_out, &sum); - // TODO(guosheng): Subtract the out of path's loss, since not all - // class(leaf) nodes' path lengths equal code_length. But it won't break the - // gradient check since both have the out of path's loss and will cancel out - // each other. - out_mat.device(place) = sum_mat + out_mat; - } -}; - -template -class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoidGrad"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoidGrad"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - bool is_sparse = ctx.Attr("is_sparse"); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoidGrad"); - auto& pre_out = GET_DATA_SAFELY(ctx.Input("PreOut"), "Input", - "PreOut", "HierarchicalSigmoidGrad"); - auto& out_grad = GET_DATA_SAFELY( - ctx.Input(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "HierarchicalSigmoidGrad"); - LoDTensor pre_out_grad; - - pre_out_grad.mutable_data(pre_out.dims(), ctx.GetPlace()); - in_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, in_grad, static_cast(0.0)); - - size_t num_classes = static_cast(ctx.Attr("num_classes")); - - bool is_custom = false; - if (path) { - is_custom = true; - } - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - // softrelu derivative - - auto blas = phi::funcs::GetBlas(ctx); - - auto* pre_out_grad_data = pre_out_grad.data(); - auto* pre_out_data = pre_out.template data(); - auto n = pre_out.numel(); - blas.VEXP(n, pre_out_data, pre_out_grad_data); - blas.VINV(n, pre_out_grad_data, pre_out_grad_data); - for (int64_t i = 0; i < n; ++i) { - pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; - } - bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - auto* out_grad_data = out_grad.template data(); - - int64_t dim0 = pre_out_grad.dims()[0]; - int64_t dim1 = pre_out_grad.dims()[1]; - for (int64_t i = 0; i < dim0; ++i) { - T tmp = out_grad_data[i]; - blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); - } - // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to - // be consistent with the clipping in forward. - auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } - if (!is_sparse) { - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, w_grad, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } else { - PADDLE_ENFORCE_NOT_NULL(path, - platform::errors::NotFound( - "Custom tree must be set for sparse mode!")); - framework::Vector real_rows = PathToRows(*path); - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->set_rows(real_rows); - // Build a map of id -> row_index to speed up finding the index of one id - w_grad->set_height(w.dims()[0]); - auto* w_grad_value = w_grad->mutable_value(); - framework::DDim temp_dim(w.dims()); - temp_dim[0] = real_rows.size(); - w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); - zero(dev_ctx, w_grad_value, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } - bit_code->MulGradError(pre_out_grad, w, in_grad); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index 92cc6077defcd3f2b27c1b45875014742bc792ae..c9fd75651b5892beffa3b2aad7c21a0805facfce 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,27 +30,6 @@ class HistogramOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "histogram"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "histogram"); - const auto &nbins = ctx->Attrs().Get("bins"); - const auto &minval = ctx->Attrs().Get("min"); - const auto &maxval = ctx->Attrs().Get("max"); - - PADDLE_ENFORCE_GE(nbins, 1, - platform::errors::InvalidArgument( - "The bins should be greater than or equal to 1." - "But received nbins is %d", - nbins)); - PADDLE_ENFORCE_GE(maxval, minval, platform::errors::InvalidArgument( - "max must be larger or equal to min." - "But received max is %d, min is %d", - maxval, minval)); - - ctx->SetOutputDim("Out", phi::make_ddim({nbins})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -81,7 +62,12 @@ class HistogramOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(histogram, HistogramInferShapeFunctor, + PD_INFER_META(phi::HistogramInferMeta)); + REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + HistogramInferShapeFunctor); diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index fea71edf41313f9a93c3a2a0311d0db69db3b41c..069cc9416a620cec987f6463841ecd677db8c7b4 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -13,8 +13,13 @@ // limitations under the License. #include "paddle/fluid/operators/index_select_op.h" + #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of IndexSelectOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto index_dim = ctx->GetInputDim("Index"); - auto dim = ctx->Attrs().Get("dim"); - - PADDLE_ENFORCE_EQ( - dim < input_dim.size() && dim >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_dim.size(), input_dim.size() - 1, dim)); - - PADDLE_ENFORCE_EQ( - index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(Index) must be 1-D tensor. " - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - index_dim, index_dim.size())); - - PADDLE_ENFORCE_EQ(index_dim[0] != 0, true, - platform::errors::InvalidArgument( - "The length of Input(Index) can't be 0.")); - - auto output_dim = phi::vectorize(input_dim); - if (dim < 0) { - dim += input_dim.size(); - } - output_dim[dim] = index_dim[0]; - ctx->SetOutputDim("Out", phi::make_ddim(output_dim)); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor, + PD_INFER_META(phi::IndexSelectInferMeta)); REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, ops::IndexSelectGradMaker, - ops::IndexSelectGradMaker); + ops::IndexSelectGradMaker, + IndexSelectInferShapeFunctor); REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - index_select, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel); -REGISTER_OP_CPU_KERNEL( - index_select_grad, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel); diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu deleted file mode 100644 index f810aee2adea540f1ffb6999ce38380ee05d0901..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_select_op.cu +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void index_select_cuda_kernel(const T* input, T* output, - const IndexT* index, int64_t N, - int64_t stride, int64_t size, - int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - output[idx] = input[input_idx]; -} - -template -__global__ void index_select_grad_cuda_kernel(const T* output_grad, - T* input_grad, - const IndexT* index, int64_t nums, - int64_t N, int64_t stride, - int64_t size, int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); -} - -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - -template -class IndexSelectCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* index = context.Input("Index"); - auto* out = context.Output("Out"); - int dim = context.Attr("dim"); - auto input_dim = in->dims(); - auto output_dim = out->dims(); - dim = dim >= 0 ? dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = out->numel(); - - auto stream = - context.template device_context().stream(); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -template -class IndexSelectGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = context.Input(framework::GradVarName("Out")); - auto* in_grad = context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - - auto* output_grad_data = output_grad->data(); - auto* in_grad_data = in_grad->mutable_data(context.GetPlace()); - - int dim = context.Attr("dim"); - auto input_dim = in_grad->dims(); - auto output_dim = output_grad->dims(); - dim = dim >= 0 ? dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - int64_t numel = in_grad->numel(); - int64_t index_nums = index->numel(); - int64_t out_nums = output_grad->numel(); - - auto stream = - context.template device_context().stream(); - - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_select, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - index_select_grad, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 04b4f69add78513bf716ab03d3bc2ba86dfbad2d..684829be2697cdc1676e8b80e15b2d600d922f3b 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, output->Resize(output_dim); } -template -class IndexSelectKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto inputs = *context.Input("X"); - auto* index = context.Input("Index"); - auto* output = context.Output("Out"); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += inputs.dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectInner(context, &inputs, *index, output, - dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectInner(context, &inputs, *index, - output, dim); - } - } -}; - template struct IndexSelectAdd { void operator()(const framework::ExecutionContext& ctx, int slice_size, @@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, x_grad->Resize(output_dim); } -template -class IndexSelectGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_grad = - context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += out_grad->dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectGradInner(context, *out_grad, *index, - x_grad, dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectGradInner(context, *out_grad, - *index, x_grad, dim); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index bce7a3c1caae39d21c9324b0f927401317284cc5..a232fba7e28d68c2df8394caa6bc5d93397f1f37 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class IndexSelectNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 7f5136969980b887bb7bbe013690898e66abeac1..77951ff394e7491569746c89ac45826f23fdf313 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -323,6 +323,7 @@ class InplaceABNGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker, ops::BatchNormOpInferVarType, ops::InplaceABNOpGradMaker, diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 0ae7a9fa02f1fb217555ae41d8b25cbba0e43d19..1c79213757fdfa8d9ef0d7c7ab315d03f94b0c57 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -12,56 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/isclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct IscloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - // *out_data = true; - for (int i = 0; i < num; i++) { - out_data[i] = true; - } - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - // *out_data &= val; - out_data[i] = val; - } - } -}; - class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -100,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", input_dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -154,12 +83,11 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor, + PD_INFER_META(phi::ValueCompareInferMeta)); REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::IscloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); + ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor); diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu deleted file mode 100644 index 09710ba0c6957d39318abfc24113d4b9db11622d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isclose_op.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/isclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - out_data[i] = val; - // if (!val) *out_data = false; - } -} - -template -struct IscloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, num * sizeof(bool)); -#else - cudaMemset(out_data, true, num * sizeof(bool)); -#endif - IscloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h deleted file mode 100644 index cde5d2afbf009a16a3d0c3601697703d8ec8eb7d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct IscloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class IscloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - IscloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index a78d8ec10149db5a1f8d585cb06bb08ea6ca5a5f..67c1942ea0b41e480c524f9c188b2a82649ba44e 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -9,10 +9,11 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,44 +23,6 @@ using framework::Tensor; class KLDivLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_target = ctx->GetInputDim("Target"); - PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), - platform::errors::InvalidArgument( - "Input(X) rank and Input(Target) rank should be " - "same, but received X rank(%d) != Target rank(%d)", - dim_x.size(), dim_target.size())); - for (int i = 0; i < dim_x.size(); i++) { - if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) { - PADDLE_ENFORCE_EQ( - dim_x[i], dim_target[i], - platform::errors::InvalidArgument( - "Input(X) and Input(Target) should in same shape. but received " - "X dimension[%d](%d) != Target dimension[%d](%d)", - i, dim_x[i], i, dim_target[i])); - } - } - - auto reduction = ctx->Attrs().Get("reduction"); - - auto reduction_valid = "mean" == reduction || "sum" == reduction || - "batchmean" == reduction || "none" == reduction; - PADDLE_ENFORCE_EQ( - reduction_valid, true, - platform::errors::InvalidArgument( - "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); - - if ("none" == reduction) { - ctx->SetOutputDim("Loss", dim_x); - } else { - ctx->SetOutputDim("Loss", {1}); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -172,15 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor, + PD_INFER_META(phi::KLDivInferMeta)); + REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker, - ops::KLDivLossOpGradMaker); + ops::KLDivLossOpGradMaker, + KLDivInferShapeFunctor); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - kldiv_loss, ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CPU_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu deleted file mode 100644 index 5226cb8c08e3db4a0bfbbe4440c27264903f06e3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - kldiv_loss, - ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CUDA_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h deleted file mode 100644 index 5a6ef06f5eb1e855c8a528664528c9919304c7b9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using Array1 = Eigen::DSizes; - -template -struct KLDivLossForward { - HOSTDEVICE KLDivLossForward() {} - - HOSTDEVICE T operator()(const T& target, const T& input) const { - if (target <= 0) { - return 0; - } else { - return target * (std::log(target) - input); - } - } -}; - -template -struct KLDivLossBackward { - HOSTDEVICE KLDivLossBackward() {} - - HOSTDEVICE T operator()(const T& target, const T& grad) const { - if (target <= 0) { - return 0; - } else { - return static_cast(-1.) * grad; - } - } -}; - -template -class KLDivLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - - const int n = input->dims()[0]; - - loss->mutable_data(ctx.GetPlace()); - auto input_t = framework::EigenVector::Flatten(*input); - auto target_t = framework::EigenVector::Flatten(*target); - auto loss_t = framework::EigenVector::Flatten(*loss); - auto output = target_t.binaryExpr(input_t, KLDivLossForward()); - if ("none" == reduction) { - loss_t.device(place) = output; - } else if ("batchmean" == reduction) { - auto output_sum = output.sum(); - if (n > 0) { - loss_t.device(place) = output_sum / output_sum.constant(n); - } else { - loss_t.device(place) = output_sum; - } - } else if ("mean" == reduction) { - loss_t.device(place) = output.mean(); - } else if ("sum" == reduction) { - loss_t.device(place) = output.sum(); - } - } -}; - -template -class KLDivLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* target = ctx.Input("Target"); - auto reduction = ctx.Attr("reduction"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - - const int n = input_grad->dims()[0]; - const int numel = input_grad->numel(); - const int expand = numel / loss_grad->numel(); - - input_grad->mutable_data(ctx.GetPlace()); - - auto target_t = framework::EigenVector::Flatten(*target); - - auto input_grad_t = framework::EigenVector::Flatten(*input_grad); - auto loss_grad_t = framework::EigenVector::Flatten(*loss_grad); - - auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); - auto grad_t = target_t * loss_grad_expand; - input_grad_t.device(place) = - target_t.binaryExpr(grad_t, KLDivLossBackward()); - - if ("mean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(numel); - } else if ("batchmean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(n); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index 322ae5df4cb877b4dde022e6c203a32cd8dd001d..eac181489aa9d09f4661c898b13e77570ad928a8 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 68d0c7978b4e45f216abd5fa5c4be93f788e8f04..60390016d66e3addf0ead14f6b9209511324961c 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,7 +17,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_y = ctx->GetInputDim("Y"); - auto rank_x = dim_x.size(); - auto rank_y = dim_y.size(); - auto rank = (rank_x > rank_y) ? rank_x : rank_y; - - std::vector dim_out; - dim_out.reserve(rank); - for (int i = 0; i < rank; i++) { - int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); - int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); - dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); - } - ctx->SetOutputDim("Out", phi::make_ddim(dim_out)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,7 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor, + PD_INFER_META(phi::KronInferMeta)); REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, - ops::KronGradOpMaker); + ops::KronGradOpMaker, + KronInferShapeFunctor); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 2a79cee27814e86b277d927082e9a772359217f1..4c679d30263863c70176bebb686556af056068d0 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/kthvalue_op.h" #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue"); - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_LT(axis, dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(axis, -dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - if (axis < 0) axis += dim_size; - int k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_GE( - k, 1, paddle::platform::errors::InvalidArgument( - "the k in the kthvalue must >= 1, but received %d .", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= 1d shape")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= %d columns in axis of %d", k, - axis)); - } - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor, + PD_INFER_META(phi::KthvalueInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker, ops::KthvalueGradOpMaker, - ops::KthvalueGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kthvalue, ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel); + ops::KthvalueGradOpMaker, + KthvalueInferShapeFunctor); REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad); -REGISTER_OP_CPU_KERNEL( - kthvalue_grad, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu deleted file mode 100644 index f6f56f70f1a11971b31e679ef879f2d1d0a96085..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kthvalue_op.cu +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/kthvalue_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -#endif - -namespace paddle { -namespace operators { - -int getBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -bool SortKthvalue(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - framework::Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - unsigned int grid_size = num_rows < maxGridDimX - ? static_cast(num_rows) - : maxGridDimX; - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - framework::Tensor temp_values, temp_indices; - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - auto err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " - << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, status: " - << cudaGetErrorString(err); - return false; - } -#endif - framework::Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } -#endif - auto& dev = *ctx.eigen_device(); - const Eigen::DSizes slice_indices{0, k - 1}; - const Eigen::DSizes slice_sizes{num_rows, 1}; - auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From( - static_cast(temp_indices)); - std::vector odims = {static_cast(num_rows), static_cast(1)}; - dim = phi::make_ddim(odims); - auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From( - static_cast(temp_values)); - - EigenSlice, int64_t, 2>::Eval( - dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); - EigenSlice, T, 2>::Eval( - dev, e_values, e_tmp_values, slice_indices, slice_sizes); - return true; -} - -template -class KthvalueOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - const auto& in_dims = input->dims(); - if (axis < 0) axis += in_dims.size(); - auto out_dims = output->dims(); - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - PADDLE_ENFORCE_EQ(SortKthvalue(dev_ctx, input, input_width, - input_height, k, output, indices), - true, platform::errors::External( - "KthvalueOP: Error when use cub sorting")); - return; - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - framework::Tensor trans_ind, trans_out; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - PADDLE_ENFORCE_EQ( - SortKthvalue(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind), - true, - platform::errors::External("KthvalueOP: Error when use cub sorting")); - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - int k = static_cast(context.Attr("k")); - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - if (axis < 0) axis += in_dims.size(); - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - auto& dev_ctx = context.cuda_device_context(); - int block_size = getBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kthvalue, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - kthvalue_grad, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel); diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h deleted file mode 100644 index 15df0a10c6992f07f9913b867319bff342180c3d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kthvalue_op.h +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { -template -static void getKthvalue(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, - Type* t_indices, const int& k) { - bool partial_sort_flag = (k * 64) < input_width; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } else { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - t_out[i] = col_vec[k - 1].first; - t_indices[i] = col_vec[k - 1].second; - } -} - -template -static void kthvalueAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class KthvalueCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - bool keepdim = static_cast(context.Attr("keepdim")); - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getKthvalue(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k); - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - framework::Tensor tmp_out, tmp_indices; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - getKthvalue(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k); - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - if (keepdim) { - kthvalueAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - framework::Tensor trans_dO, trans_ind; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - if (keepdim) { - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - } else { - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - kthvalueAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 412ae3c49b5f3cc9fc2422aa220af324e6d99b69..c0a4b88fc76fd0d648b289e0d2f13536523f02d8 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( */ template -void ln_bwd_1024_kernel_driver( - const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols, - float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, - const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr, - ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr, - T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { +void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, + const int cols, float epsilon, const T *x_ptr, + const ScaleT *scale_ptr, const U *mean_ptr, + const U *var_ptr, const T *dout_ptr, T *dx_ptr, + ScaleT *dscale_ptr, ScaleT *dbias_ptr, + const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), + T *d_dropout_src_ptr = nullptr) { auto stream = dev_ctx.stream(); if (cols == 1024) { // step-1: compute dx and reduced part results of dscale and dbias. @@ -1334,8 +1336,7 @@ static void LayerNormBackward( const U *mean, const U *var, T *d_x, LayerNormScaleBiasT *d_scale, LayerNormScaleBiasT *d_bias, float epsilon, - int64_t batch_size, int64_t feature_size, - const platform::CUDADeviceContext &dev_ctx) { + int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); #ifdef __HIPCC__ const int kMaxBlockDim = 256; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index e7d676479be0cc1176fa27c477bd35a5d6787cd3..224ab748dab6cdf8be246c4b400b4e55b6faf675 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" - #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, ops::LayerNormGradOpMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp, ops::LayerNormGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - layer_norm, ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CPU_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu deleted file mode 100644 index dfe73d3727132ae9b8f71e2a415ef5193f303493..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/layer_norm_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/layer_norm_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - std::vector input_shape, - const T *bias, const T *scale, - T *output, T *mean, T *variance, - int begin_norm_axis, float eps) { - const auto x_dims = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( - input, scale, bias, output, mean, variance, eps, feature_size)); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Product from begin_norm_axis to end in layer_norm must be larger " - "than 1")); - break; - } -} - -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *x = ctx.Input("X"); - - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x->dims(); - auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - auto *mean_data = mean->mutable_data(ctx.GetPlace()); - auto *var_data = var->mutable_data(ctx.GetPlace()); - - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (void_scale_data != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - if (void_bias_data != nullptr) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::TransToProtoVarType(bias->dtype()), - platform::errors::InvalidArgument( - "Thie Scale and Bias of layer_norm op " - "should have the same data type.")); - } - } else { - scale_bias_dtype = (void_bias_data != nullptr - ? framework::TransToProtoVarType(bias->dtype()) - : x_dtype); - } - - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::DataTypeTrait::DataType(), - platform::errors::InvalidArgument( - "Unsupported data type of Scale and Bias: %s", - framework::DataTypeToString(scale_bias_dtype))); - } - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto stream = ctx.cuda_device_context().stream(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - LayerNormForward<<< \ - batch_size, kBlockDim, 0, stream>>>( \ - x_data, static_cast(void_scale_data), \ - static_cast(void_bias_data), y_data, \ - mean_data, var_data, epsilon, feature_size)); \ - default: \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - -#ifdef PADDLE_WITH_CUDA - bool can_call_1024_kernel = false; - if (feature_size == 1024 && scale != nullptr && bias != nullptr) { - can_call_1024_kernel = true; - } - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - const int grid = static_cast( - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); - if (is_scale_bias_same_dtype_with_x) { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } else { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } - } else { -#endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } -#ifdef PADDLE_WITH_CUDA - } -#endif - -#undef PADDLE_LAUNCH_LAYERNORM_FWD - } -}; - -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - // d_x, d_scale, d_bias may be nullptr - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto *x = ctx.Input("X"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto *x_data = x->data(); - auto *d_y_data = d_y->data(); - - auto *mean_data = mean->data(); - auto *var_data = var->data(); - - auto *d_x_data = - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (scale != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - } else { - // FIXME(zengjinle): do not find a better way to get the right - // data type of the d_scale and d_bias if scale == nullptr. - auto *bias = ctx.Input("Bias"); - if (bias != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(bias->dtype()); - } else { - scale_bias_dtype = x_dtype; - } - } - -#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - auto *scale_data = \ - (scale == nullptr ? nullptr : scale->data()); \ - auto *d_scale_data = \ - (d_scale == nullptr ? nullptr : d_scale->mutable_data( \ - ctx.GetPlace())); \ - auto *d_bias_data = \ - (d_bias == nullptr ? nullptr : d_bias->mutable_data( \ - ctx.GetPlace())); \ - auto *d_x_data = \ - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); \ - LayerNormBackward( \ - x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, \ - d_scale_data, d_bias_data, epsilon, batch_size, feature_size, \ - ctx.cuda_device_context()); \ - } while (0) - - if (scale_bias_dtype == x_dtype) { - PADDLE_LAUNCH_LAYERNORM_BWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_BWD(U, false); - } - -#undef PADDLE_LAUNCH_LAYERNORM_BWD - } -}; - -template class LayerNormDirectCUDAFunctor; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#elif CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#endif diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h deleted file mode 100644 index 9d70b7cf707437136bf358d31ea6fd4cc0f2a534..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/layer_norm_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) -#include "paddle/fluid/operators/jit/kernels.h" -#endif -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { - -// Wrap RowwiseMean and ColwiseMean. -// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is -// significantly faster. Unlike the RowwiseMean and ColwiseMean, the -// implementation only considers 2D. -template -struct RowwiseMean2D { - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({right_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); - } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - false, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - row_mean_(context, input, out); - } - - private: - phi::funcs::RowwiseMean row_mean_; -}; - -template -struct ColwiseSum2D { - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({left_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); - } - - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - true, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - col_wise_(context, input, out); - } - - private: - phi::funcs::ColwiseSum col_wise_; -}; - -template -struct SubAndSquareFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } -}; - -template -struct DivAndSqrtFunctor { - explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } - inline HOSTDEVICE T operator()(T a, T b) const { - return a / (sqrt(b + epsilon_)); - } - - private: - T epsilon_; -}; - -template -struct MulInvVarFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { - return a * std::sqrt(1.0 / b); - } -}; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class LayerNormDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, const T* input, - std::vector input_shape, const T* bias, const T* scale, - T* output, T* mean, T* variance, int begin_norm_axis, - float eps); -}; -#endif - -template -class LayerNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto x = *ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x.dims(); - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - x.Resize(matrix_shape); - Tensor out; - out.ShareDataWith(*y); - out.Resize(matrix_shape); - -#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ - defined(__OSX__) - auto& dev_ctx = ctx.template device_context(); - RowwiseMean2D row_mean(left, right, ctx.device_context()); - - // get mean - row_mean(dev_ctx, x, mean); - - // get variance - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); - row_mean(dev_ctx, out, var); - - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &out); - - if (scale) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); - } - if (bias) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); - } -#else - PADDLE_ENFORCE_EQ(mean->numel(), left, - platform::errors::InvalidArgument( - "mean's length (%d) is not equal with expected (%d).", - mean->numel(), left)); - PADDLE_ENFORCE_EQ(var->numel(), left, - platform::errors::InvalidArgument( - "var's length (%d) is not equal with expected (%d).", - var->numel(), left)); - if (scale) { - PADDLE_ENFORCE_EQ( - scale->numel(), right, - platform::errors::InvalidArgument( - "scale's length (%d) is not equal with expected (%d).", - scale->numel(), right)); - } - if (bias) { - PADDLE_ENFORCE_EQ( - bias->numel(), right, - platform::errors::InvalidArgument( - "bias's length (%d) is not equal with expected (%d).", - bias->numel(), right)); - } - - auto ker = - jit::KernelFuncs, platform::CPUPlace>::Cache() - .At(right); - ker(x.data(), out.data(), mean->data(), var->data(), - scale ? scale->data() : nullptr, bias ? bias->data() : nullptr, - static_cast(left), static_cast(epsilon), right); -#endif - } -}; - -template -class LayerNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto x = *ctx.Input("X"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto d_y = *ctx.Input(framework::GradVarName("Y")); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - d_y.Resize(matrix_shape); - auto& dev_ctx = ctx.template device_context(); - ColwiseSum2D colwise_sum(left, right, - ctx.device_context()); - - Tensor temp; - Tensor temp_norm; - if (d_scale || d_x) { - x.Resize(matrix_shape); - temp.mutable_data(matrix_shape, ctx.GetPlace()); - - temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); - } - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - colwise_sum(dev_ctx, d_y, d_bias); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); - colwise_sum(dev_ctx, temp, d_scale); - } - - if (d_x) { - framework::DDim vec_shape({left}); - d_x->mutable_data(ctx.GetPlace()); - auto dx_dim = d_x->dims(); - Tensor temp_vec; - temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - - RowwiseMean2D row_mean(left, right, - ctx.device_context()); - - if (d_scale) { - // dy_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); - framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } else { - // dy_dx - framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, d_y, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } - // dy_var_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); - - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), d_x); - d_x->Resize(dx_dim); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index c88880b43fff9fccd9764f145fba8ca4c61343c7..3c7e5bf9593e0ae2b3d8c04db1467c3b8fd1e174 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 0480a354c8bd8fdb81c95a576f57e9a12019ffc9..3b21a55f8df0dbb532729cf5cbca4c7362223b9c 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc index 148fb05afcfd9a4ef1fcbc587a2bd33947a41000..72c6b41efa98922b4ba23fa4b6e1a83f931c701e 100644 --- a/paddle/fluid/operators/lgamma_op.cc +++ b/paddle/fluid/operators/lgamma_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -35,16 +38,6 @@ $$out = log\Gamma(x)$$ class LgammaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, ops::LgammaGradMaker, - ops::LgammaGradMaker); + ops::LgammaGradMaker, + LgammaInferShapeFunctor); REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); - -REGISTER_OP_CPU_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel) - -REGISTER_OP_CPU_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu deleted file mode 100644 index b9f273727b00bb5ec4398bf82b0a19737ee2387a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lgamma_op.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/lgamma_op.h" - -namespace paddle { -namespace operators { - -template -struct CudaLgammaFunctor { - __device__ __forceinline__ T operator()(const T x) const { - return Eigen::numext::lgamma(x); - } -}; - -template -class LgammaKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel); - -REGISTER_OP_CUDA_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h deleted file mode 100644 index 674054e74573208ea9bbd537419d202e1a30d8c0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lgamma_op.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct LgammaFunctor { - LgammaFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = Eigen::numext::lgamma(input_[idx]); - } - - private: - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct LgammaGradFunctor { - LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; - -using Tensor = framework::Tensor; - -template -class LgammaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace(), - size_t(x->numel() * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class LgammaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index 0e69b397e04c7eda7f515350caf870be5d7b57a5..da38f906b9bd34ba6c3251059ee12902e62eadaf 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_softmax_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMetaCheckAxis)); REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker, ops::LogSoftmaxOpInferVarType, ops::LogSoftmaxGradOpMaker, - ops::LogSoftmaxGradOpMaker); + ops::LogSoftmaxGradOpMaker, + LogSoftmaxInferShapeFunctor); REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp); - -REGISTER_OP_CPU_KERNEL( - log_softmax, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - log_softmax_grad, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu deleted file mode 100644 index 26b6ce43303d181c41b60cf36c229d00acb0e626..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_softmax_op.cu +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class LogSoftmaxKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto &dev_ctx = ctx.template device_context(); - phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); - } -}; - -template -class LogSoftmaxGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto &dev_ctx = ctx.template device_context(); - phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, - input_axis, dx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); -#endif diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h deleted file mode 100644 index 162087a75662d711a63cbbe4beeaecf265367c6a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_softmax_op.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline size_t SizeToAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? kThreshold : x; - } -}; - -template -struct LogSoftmaxFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y, const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - constexpr int kAxisDim = 1; - - int axis_dim = X->dims()[axis]; - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - framework::DDim dim_2d{n, d}; - - auto logits = EigenMatrix::From(*X, dim_2d); - auto log_softmax = EigenMatrix::From(*Y, dim_2d); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_axis(kAxisDim); - Eigen::DSizes batch_classes(batch_size, num_classes); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); - Eigen::DSizes one_axis_one(1, axis_dim, 1); - Eigen::DSizes one_axis(1, axis_dim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - // For numerical stability, logits should be shifted by maximum number along - // axis, calculate shifted_logits into log_softmax tensor for memory reuse. - if (num_remain == 1) { - // axis == -1, axis and class in same dimension, calculate along - // class dimension directly for higher performance - log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - } else { - // axis != -1, class dimension split into (axis, remain), max and sum - // should be calculated along axis dimension - log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) - .unaryExpr(ValueClip()); - } - - log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); - } -}; - -template -class LogSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - - if (X->numel() != 0) { - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); - } - } -}; - -template -struct LogSoftmaxGradFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* Y, - const framework::Tensor* dY, framework::Tensor* dX, - const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int n = SizeToAxis(axis, Y->dims()); - const int d = SizeFromAxis(axis, Y->dims()); - framework::DDim dim_2d{n, d}; - - auto y = EigenMatrix::From(*Y, dim_2d); - auto dy = EigenMatrix::From(*dY, dim_2d); - auto dx = EigenMatrix::From(*dX, dim_2d); - - const int axis_dim = Y->dims()[axis]; - const int batch_size = y.dimension(kBatchDim); - const int num_classes = y.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); - } -}; - -template -class LogSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = Out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - - if (Out->numel() != 0) { - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index 5795f1dffac785b82662cebb84e8224cec78ecf6..6ce21aec9215a007ac6ca49ee1bffc1a40d40c81 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -27,7 +28,7 @@ class LogSoftmaxNPUKernel : public framework::OpKernel { auto* X = ctx.Input("X"); auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); Out->mutable_data(ctx.GetPlace()); if (X->numel() != 0) { @@ -47,7 +48,7 @@ class LogSoftmaxGradNPUKernel : public framework::OpKernel { auto* dOut = ctx.Input(framework::GradVarName("Out")); auto* dX = ctx.Output(framework::GradVarName("X")); const int rank = dOut->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); // allocate memory on device. dX->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 65297abe3e49b8f0fe85e4102d8f449c2c02788f..88d70d9bb7dae50f9ca0d82ce53896632b8b00ed 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -221,7 +221,7 @@ class LRNOp : public framework::OperatorWithKernel { auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); auto dl = framework::StringToDataLayout(data_format); - // Some models may have intentionally set "AnyLayout" for pool + // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != framework::DataLayout::kAnyLayout) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 214b2eccae9f75e9bfcfa3df0b823918e2b0c353..2414ae68438fd4e3cff94d60f400063b72116714 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,9 +18,9 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { @@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, const auto W = udims[udims.size() - 1]; auto L_dataptr = L->mutable_data(dev_ctx.GetPlace()); platform::ForRange x_for_range(dev_ctx, LU->numel()); - TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, + L_dataptr); x_for_range(tril_computer); - TrilTriuCompute triu_computer(LU->data(), 0, false, H, W, - U->mutable_data(dev_ctx.GetPlace())); + phi::funcs::TrilTriuCompute triu_computer( + LU->data(), 0, false, H, W, U->mutable_data(dev_ctx.GetPlace())); x_for_range(triu_computer); // set L's diagonal 1 @@ -532,15 +533,15 @@ class LUGradKernel : public framework::OpKernel { auto phil_rank = LmHdims.size(); auto phiu_rank = UmHdims.size(); platform::ForRange l_for_range(dev_ctx, phi_L.numel()); - TrilTriuCompute tril_computer(phi_L.data(), -1, true, - LmHdims[phil_rank - 2], - LmHdims[phil_rank - 1], phi_L.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_L.data(), -1, true, LmHdims[phil_rank - 2], + LmHdims[phil_rank - 1], phi_L.data()); l_for_range(tril_computer); platform::ForRange u_for_range(dev_ctx, phi_U.numel()); - TrilTriuCompute triu_computer(phi_U.data(), 0, false, - UmHdims[phiu_rank - 2], - UmHdims[phiu_rank - 1], phi_U.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_U.data(), 0, false, UmHdims[phiu_rank - 2], + UmHdims[phiu_rank - 1], phi_U.data()); u_for_range(triu_computer); Tensor_Add(dev_ctx, phi_L, phi_U, &phi); @@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute tril_computer(phi_complement.data(), -1, true, H, - W, phi_complement_l.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_complement.data(), -1, true, H, W, + phi_complement_l.data()); x_for_range(tril_computer); Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); @@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute triu_computer(phi_complement.data(), 0, false, H, W, - phi_complement_u.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_complement.data(), 0, false, H, W, phi_complement_u.data()); x_for_range(triu_computer); Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index d2303f2c08da8e98053e314f9756e4e375e27775..e4100867dc685ef68cd01b22ab7972aa8b436a06 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -16,7 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lu_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" namespace paddle { namespace operators { @@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { auto W = ldims[ldims.size() - 1]; auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); platform::ForRange l_for_range(dev_ctx, dl->numel()); - TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, + L_dataptr); l_for_range(tril_computer); const auto udims = du->dims(); @@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { W = udims[udims.size() - 1]; auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); platform::ForRange u_for_range(dev_ctx, du->numel()); - TrilTriuCompute triu_computer(du->data(), 0, false, H, W, U_dataptr); + phi::funcs::TrilTriuCompute triu_computer(du->data(), 0, false, H, W, + U_dataptr); u_for_range(triu_computer); auto xdims = dx->dims(); diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index a6eb535c693b8422a7b066618cbfddeddd751387..1887bbcfb7efdcf43e0cc020d773268312523505 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,16 +23,6 @@ class MaskedSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - - // output will only be a 1-D Tensor - ctx->SetOutputDim("Y", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Y"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,8 +92,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(masked_select, MaksedSelectInferShapeFunctor, + PD_INFER_META(phi::MaskedSelectInferMeta)); + REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker, ops::MaskedSelectGradOpMaker, - ops::MaskedSelectGradOpMaker); + ops::MaskedSelectGradOpMaker, + MaksedSelectInferShapeFunctor); REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad, ops::MaskedSelectedGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 9994ccc10cb13b2f692b18f16182f6bcdad7efa5..b77e23450360c836ae3efe0a6dc2c77216e660f0 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -34,10 +34,10 @@ namespace paddle { namespace operators { namespace math { -template +template static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, - size_t n, BinaryOp op, - const platform::CUDADeviceContext &dev_ctx) { + size_t n, BinaryOp op, const Context &dev_ctx) { memory::AllocationPtr allocation; void *temp_storage = nullptr; size_t temp_storage_bytes = 0; @@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, } } -template +template static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, size_t inner_dim, T init, BinaryOp op, - bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + bool reverse, const Context &dev_ctx) { constexpr size_t kThreadNumX = 16; constexpr size_t kThreadNumY = 32; @@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, } } -template +template void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, size_t inner_dim, T init, BinaryOp op, bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + const Context &dev_ctx) { if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; if (outer_dim == 1 && inner_dim == 1) { @@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, CubInclusiveScan(x, y, mid_dim, op, dev_ctx); } } else if (inner_dim != 1) { - platform::ForRange for_range( - dev_ctx, outer_dim * inner_dim); + platform::ForRange for_range(dev_ctx, outer_dim * inner_dim); if (reverse) { for_range( InclusiveScanOuterOrMidDimFunctor( diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index cdf204628b638f877c92e35a8941487aa39b5427..56f65340ea999f48702294f912c4354d83990881 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -14,8 +14,11 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,26 +26,6 @@ namespace operators { class MatrixPowerOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power"); - auto dims = ctx->GetInputDim("X"); - auto n_dim = dims.size(); - PADDLE_ENFORCE_GE(n_dim, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. But " - "received a %d dimension tensor.", - n_dim)); - PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - dims[n_dim - 2], dims[n_dim - 1])); - ctx->SetOutputDim("Out", dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +99,14 @@ class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(matrix_power, MatrixPowerInferShapeFunctor, + PD_INFER_META(phi::MatrixPowerInferMeta)); + REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerOpInferVarType, ops::MatrixPowerGradOpMaker, - ops::MatrixPowerGradOpMaker); + ops::MatrixPowerGradOpMaker, + MatrixPowerInferShapeFunctor); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 812c55cdd5055186d7fd83a2057d88256f3b34a3..2e82b47e8da1c6eb6f4a05fc4f7f356110f9fff1 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace operators { @@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { layer_norm_p->execute(astream, args); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); + y->set_layout(phi::DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); } }; diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index bdb4fe1198a8e550cb22cf4d727f6b7da34c28fe..86ecb01c89af7e12cc2a3dbcf91740d65d0ac247 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -50,13 +50,8 @@ class PReluMKLDNNHandler if (weights->dims().size() != x->dims().size()) { auto new_weights_dims = std::vector(x->dims().size(), 1); if (mode == "channel") { - if (data_format == "NHWC") { - new_weights_dims[x->dims().size() - 1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } else { - new_weights_dims[1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } + new_weights_dims[1] = + *std::max_element(weights_dims.begin(), weights_dims.end()); } weights_dims = std::move(new_weights_dims); } diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 717af61b858dc16f9bdda20f530cbf06a09908eb..0e988557df6262d4f924323084daf062aee75e0c 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -98,7 +98,7 @@ TEST(test_pool2d_transpose_nhwc, cpu_place) { TEST(test_pool2d_relu_relu_nhwc, cpu_place) { framework::DDim dims({1, 4, 8, 512}); // NHWC shape - framework::DDim expected_dims({1, 512, 3, 7}); // NHWC expected shape + framework::DDim expected_dims({1, 512, 3, 7}); // NCHW expected shape platform::CPUPlace p; framework::Scope scope; diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd5107cee12e0995948e320ef3ed616f4d..9c16ccb138f7da56568ce6224dc30deb5bbccb7f 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index 2bacda8afb0eb340c4c8d4068f3013e2adbc7f91..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16eb3f81b10d541230f49b73fd836543..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index b309e1b87ef9033bd4302cdad4ea60a64cbf02eb..5b107ce643df33af79230c30d784d1ad84c26666 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -16,77 +16,19 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -/** - * @brief compute the output shape and check the input shape valid or not - */ -inline framework::DDim ComputeAndCheckShape( - const bool is_runtime, const std::vector& inputs_dims) { - const size_t n = inputs_dims.size(); - auto first_dim = inputs_dims[0]; - - bool is_vector = false; - framework::DDim out_dim; - - PADDLE_ENFORCE_LT( - first_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the first tensor is 1D of size n view it as a row vector (1, n) - if (first_dim.size() == 1) { - first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); - is_vector = true; - } - - auto last_dim = inputs_dims[n - 1]; - PADDLE_ENFORCE_LT( - last_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the last tensor is 1D of size n view it as a column vector (n, 1) - if (last_dim.size() == 1) { - last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); - out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); - } else { - out_dim = is_vector ? phi::make_ddim({last_dim[1]}) - : phi::make_ddim({first_dim[0], last_dim[1]}); - } - - auto width = first_dim[1]; - for (size_t i = 1; i < n - 1; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); - - const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ( - tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - width = tmp_dim[1]; - } - - PADDLE_ENFORCE_EQ( - last_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - - return out_dim; -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -105,22 +47,6 @@ If the first argument is 1-D it is treated as a row vector. If the last argument class MultiDotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(1), - platform::errors::InvalidArgument( - "The number of input tensors in multi_dot op should > 1.")); - auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; class MultiDotOpGrad : public framework::OperatorWithKernel { @@ -171,9 +97,15 @@ class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(multi_dot, MultiDotInferShapeFunctor, + PD_INFER_META(phi::MultiDotInferMeta)); + REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, ops::MultiDotOpGradMaker, - ops::MultiDotOpGradMaker); + ops::MultiDotOpGradMaker, + MultiDotInferShapeFunctor); + REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 313a479ea301bb2c7dac0d0a27ca6064de99536a..8771a6573cba044d182aced752d3a65c446ad32e 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/multiplex_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ops::MultiplexGradMaker, ops::MultiplexGradMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); -REGISTER_OP_CPU_KERNEL( - multiplex, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel); -REGISTER_OP_CPU_KERNEL( - multiplex_grad, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu deleted file mode 100644 index 0a32ee96fb6938157364dc717724ce9193286f27..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multiplex_op.cu +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/multiplex_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T), stream); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T), stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - multiplex, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel); -REGISTER_OP_CUDA_KERNEL( - multiplex_grad, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h deleted file mode 100644 index 1d0a009edeedcad746853bb286af52cce474df87..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/multiplex_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -class MultiplexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - auto index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T)); - } - } -}; - -template -class MultiplexGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = - ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - auto* index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T)); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 5d394424d54f5291df2855041d5d7f943dbd43d0..51daccce0e8822a1eec25ac428e5a56c632805e2 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -57,21 +59,7 @@ where, $\sum {x^2}$ is calculated along the `axis` dimension. }; class NormOp : public framework::OperatorWithKernel { - public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NormOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NormOp"); - auto xdim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", xdim); - - if (ctx->Attrs().Get("is_test") == false) { - int axis = ctx->Attrs().Get("axis"); - if (axis < 0) axis = xdim.size() + axis; - xdim[axis] = 1; - ctx->SetOutputDim("Norm", xdim); - } - } }; class NormOpGrad : public framework::OperatorWithKernel { @@ -111,7 +99,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, + PD_INFER_META(phi::NormInferMeta)); + REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, ops::NormOpGradOpMaker, - ops::NormOpGradOpMaker); + ops::NormOpGradOpMaker, + NormInferShapeFunctor); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f7a3b82acf19fa79cbf5c632977e6ae533ae12b --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/number_count_op.h" + +namespace paddle { +namespace operators { + +class NumberCountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("gate_idx"), "Input", "gate_idx", + "NumberCount"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "number_count", + "NumberCount"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // the dtype of the gate_idx should be same as int64 + auto gate_idx_dtype = + OperatorWithKernel::IndicateVarDataType(ctx, "gate_idx"); + + PADDLE_ENFORCE_EQ(gate_idx_dtype, framework::proto::VarType::INT64, + platform::errors::InvalidArgument( + "The dtype of the gate_idx_dtype should be int64")); + return framework::OpKernelType(gate_idx_dtype, ctx.GetPlace()); + } +}; + +class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("gate_idx", "(Tensor) The input gate index tensor."); + AddOutput("Out", "(Tensor) The output expert count tensor."); + AddAttr("upper_range", "(int), The number of experts."); + + AddComment(R"DOC(number_count Operator.count gate indices.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CPU_KERNEL(number_count, ops::NumberCountOpCPUKernel, + ops::NumberCountOpCPUKernel); + +REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, + ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..97e4b4f2845ae132c28d3bb71dcc8e73f02e193a --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/number_count_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1) +#define PERTHREAD_EXPERTS 256 +#define WARP_SIZE 32 + +const int CUDA_NUM_THREADS = 512; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +__global__ void initialize_zero_kernel(T* data, const int length) { + CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } +} + +template +__global__ void NumberCount(const T* gate_idx, T* number_count, + int64_t batch_size, int upper_range) { + int res_tmp[PERTHREAD_EXPERTS] = {0}; + int expert_min = blockIdx.x * PERTHREAD_EXPERTS; + int expert_max = expert_min + PERTHREAD_EXPERTS; + if (expert_max > upper_range) { + expert_max = upper_range; + } + for (int i = threadIdx.x; i < batch_size; i += blockDim.x) { + T idx = gate_idx[i]; + if (idx == -1) { + continue; + } + if (idx < expert_min || idx >= expert_max) { + continue; + } + res_tmp[idx - expert_min] += 1; + } + for (int i = expert_min; i < expert_max; ++i) { + int x = res_tmp[i - expert_min]; +#pragma unroll + for (int j = 1; j < WARP_SIZE; j <<= 1) { +#ifdef __HIPCC__ + x = x + __shfl_down(x, j); +#else + x = x + __shfl_down_sync(-1u, x, j); +#endif + } + if (threadIdx.x % WARP_SIZE == 0) { + platform::CudaAtomicAdd(number_count + i, x); + } + } +} + +template +class NumberCountOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto gate_idx = context.Input("gate_idx"); + auto upper_range = context.Attr("upper_range"); + auto number_count = context.Output("Out"); + + int64_t batch_size = gate_idx->numel(); + auto place = context.GetPlace(); + const auto& dev_ctx = + context.template device_context(); + + framework::DDim out_dims = phi::make_ddim({upper_range}); + auto out_data = number_count->mutable_data(out_dims, place); + const T* gate_data = gate_idx->data(); + + initialize_zero_kernel< + T><<>>( + out_data, upper_range); + + NumberCount< + T><<>>( + gate_data, out_data, batch_size, upper_range); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h new file mode 100644 index 0000000000000000000000000000000000000000..95e64946fb8a2156fdb4cbae880ccf2c143447ed --- /dev/null +++ b/paddle/fluid/operators/number_count_op.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class NumberCountOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support expert count op for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc index d0cb674b4049f988773accd4b0652d62a1be2287..adc4a2ffaf8c54d32c10fa47e27d86aef2f9c508 100644 --- a/paddle/fluid/operators/pad_op_npu.cc +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -90,5 +90,5 @@ namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel, ops::PadNPUKernel, ops::PadNPUKernel); -REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel, +REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadGradNPUKernel, ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 9bd6ae8bab829e2f200f697e10e7e54f398f8d73..de35f67405810180554bfd556f91b7501f9c4ba2 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -9,14 +9,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prelu_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + +framework::OpKernelType innerGetKernelTypeForVar( + const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { +#ifdef PADDLE_WITH_MKLDNN + auto isOneDNNKernelChosen = + (expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN); + auto isNotOneDNNTensor = (tensor.layout() != framework::DataLayout::kMKLDNN); + auto isModelNHWC = + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC); + // All inputs (including alpha) need shape rotating + if (isOneDNNKernelChosen && isNotOneDNNTensor && isModelNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); +} + class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -24,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu"); - OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu"); - - auto x_dim = ctx->GetInputDim("X"); - std::string mode = ctx->Attrs().Get("mode"); - if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1, - platform::errors::InvalidArgument( - "For mode 'all', size of weight Alpha must be one. " - "But recevied alpha's size: %d.", - product(ctx->GetInputDim("Alpha")))); - } else if (mode == "channel") { - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 2, - platform::errors::InvalidArgument( - "For mode 'channel', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - const std::string data_format_str = - ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC", - true, - platform::errors::InvalidArgument( - "For mode 'channel', data_format must be one of " - "NCHW and NHWC. But recevied data_format: %s", - data_format_str)); - if (data_format_str == "NCHW") { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[1]: %d", - product(ctx->GetInputDim("Alpha")), x_dim[1])); - } else { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[%d]: %d", - product(ctx->GetInputDim("Alpha")), x_rank - 1, - x_dim[x_rank - 1])); - } - - } else if (mode == "element") { - auto alpha_dim = ctx->GetInputDim("Alpha"); - auto alpha_rank = alpha_dim.size(); - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 1, - platform::errors::InvalidArgument( - "For mode 'element', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - PADDLE_ENFORCE_EQ( - alpha_rank, x_rank, - platform::errors::InvalidArgument( - "For mode 'element', rank of weight Alpha must be ", - "equal to the rank of input(x). But recevied alpha's rank: %d, " - "x's rank: %d.", - alpha_rank, x_rank)); - size_t x_product = 1; - size_t alpha_product = 1; - for (int64_t i = x_rank - 1; i > 0; i--) { - x_product *= x_dim[i]; - alpha_product *= alpha_dim[i]; - } - PADDLE_ENFORCE_EQ( - alpha_product, x_product, - platform::errors::InvalidArgument( - "For mode 'element', the size of weight Alpha must be " - "equal to the size of input(x). But recevied alpha's size: %d, " - "x's size: %d.", - alpha_product, x_product)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " - "But recevied " - "mode: '%s'.", - mode)); - } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -128,6 +64,12 @@ class PReluOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; class PReluOpMaker : public framework::OpProtoAndCheckerMaker { @@ -212,6 +154,12 @@ class PReluGradOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; template @@ -236,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor, + PD_INFER_META(phi::PReluInferMeta)); REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, ops::PReluGradOpMaker, - ops::PReluGradOpMaker); + ops::PReluGradOpMaker, + PReluInferShapeFunctor); REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL( - prelu, ops::PReluKernel, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL( - prelu_grad, ops::PReluGradKernel, - ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu deleted file mode 100644 index 12e55d042d7037606179cc06480e4f80f942d8a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.cu +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define CUDA_NUM_THREADS 1024 - -inline static int PADDLE_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -class CUDAPReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - - VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; - - if (mode == "channel") { - bool channel_last = data_format == "NHWC"; - size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; - math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], channel, channel_last, - numel); - } else if (mode == "element") { - math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], numel); - } else { - math::PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, - o_ptr, numel); - } - } -}; - -enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar }; - -template -__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr, - const T* dy_ptr, T* dx_ptr, T* dalpha_ptr, - size_t channel_num, size_t plane_size, - size_t spatial_size, size_t numel, - PRELU_MODE mode) { - CUDA_KERNEL_LOOP(index, numel) { - T scale; - if (mode == Element) { - size_t element_index = index % spatial_size; - scale = alpha_ptr[element_index]; - } else if (mode == ChannelFirst) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - scale = alpha_ptr[channel_index]; - } else if (mode == ChannelLast) { - size_t channel_index = index % channel_num; - scale = alpha_ptr[channel_index]; - } else { - scale = alpha_ptr[0]; - } - T x = x_ptr[index]; - T dy = dy_ptr[index]; - T zero = static_cast(0); - if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy; - if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy; - } -} - -template -class PreluOpGradFunctor { - public: - void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy, - T* dx, T* dalpha, const framework::DDim& input_dims, - PRELU_MODE mode) { - size_t numel = 1; - for (size_t i = 0; i < input_dims.size(); ++i) { - numel *= input_dims[i]; - } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1]; - - PReluOpGradKernel< - T><<>>( - x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel, - mode); - } -}; - -template -class CUDAPReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - - const T* x_ptr = x->data(); - const T* alpha_ptr = alpha->data(); - const T* dy_ptr = dy->data(); - T* dx_ptr = dx ? dx->mutable_data(context.GetPlace()) : nullptr; - T* dalpha_ptr = - dalpha ? dalpha->mutable_data(context.GetPlace()) : nullptr; - - if (!dx && !dalpha) return; - - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); - auto stream = context.cuda_device_context().stream(); - - T* dalpha_tmp_ptr; - Tensor dalpha_tmp; - if (dalpha_ptr == nullptr) { - dalpha_tmp_ptr = dalpha_ptr; - } else { - auto& dev_ctx = context.template device_context(); - dalpha_tmp = context.AllocateTmpTensor(dim, dev_ctx); - dalpha_tmp_ptr = dalpha_tmp.mutable_data(context.GetPlace()); - } - - PRELU_MODE m; - bool channel_last = false; - if (mode == "element") { - m = Element; - } else if (mode == "channel") { - channel_last = data_format == "NHWC"; - m = channel_last ? ChannelLast : ChannelFirst; - } else { - m = Scalar; - } - PreluOpGradFunctor prelu_grad; - prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim, - m); - - if (dalpha_tmp_ptr == nullptr) return; - - std::vector reduce_dims; - for (size_t i = 0; i < dim.size(); i++) { - if (mode == "channel" && !channel_last && i == 1) continue; - if (mode == "channel" && channel_last && i == dim.size() - 1) continue; - if (mode == "element" && i != 0) continue; - reduce_dims.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), dalpha_tmp, dalpha, - kps::IdentityFunctor(), reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - prelu, ops::CUDAPReluKernel, - ops::CUDAPReluKernel, - ops::CUDAPReluKernel); -REGISTER_OP_CUDA_KERNEL( - prelu_grad, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h deleted file mode 100644 index 384994eb37c2a955c383ddeebafe5f0e64d3c961..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.h +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::Transform; - -template -class PReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } - } -}; - -template -class PReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - auto* alpha = context.Input("Alpha"); - const T* alpha_ptr = alpha->data(); - const T* x_ptr = x->data(); - const T* dout_ptr = dout->data(); - std::string mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (dx) { - T* dx_ptr = dx->mutable_data(context.GetPlace()); - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dx_ptr[i] = x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; - } - } - } - - index = 0; - if (dalpha) { - T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); - memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); - - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } - - // TODO(Guanzhong): add GPU kernels - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 40e3cbde3b00917ee5952b8aebd412b357683018..82fc9ef1b7858992c49f537ce8608856ef6b6fde 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); - REGISTER_OP_CPU_KERNEL( qr_grad, ops::QrGradKernel, ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index f09a07e96cd34e1b631ef9484fe23b12a3b58543..5ef02d8942797a720d18358d425cf45f77be82ad 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -48,85 +48,6 @@ static inline std::tuple _parse_qr_mode(std::string mode) { return std::make_tuple(compute_q, reduced); } -template -class QrCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool compute_q; - bool reduced_mode; - const Tensor& x = *context.Input("X"); - Tensor& q = *context.Output("Q"); - Tensor& r = *context.Output("R"); - std::string mode = context.Attr("mode"); - std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); - - auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); - auto x_dims = x.dims(); - int x_rank = x_dims.size(); - int m = x_dims[x_rank - 2]; - int n = x_dims[x_rank - 1]; - int min_mn = std::min(m, n); - int k = reduced_mode ? min_mn : m; - int batch_size = numel / (m * n); - int x_stride = m * n; - int q_stride = m * k; - int r_stride = k * n; - - auto* x_data = x.data>(); - T* q_data = nullptr; - if (compute_q) { - q_data = q.mutable_data>( - context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - } - auto* r_data = r.mutable_data>( - context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - - // Implement QR by calling Eigen - for (int i = 0; i < batch_size; ++i) { - const T* x_matrix_ptr = x_data + i * x_stride; - T* r_matrix_ptr = r_data + i * r_stride; - using EigenDynamicMatrix = - Eigen::Matrix; - auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); - Eigen::HouseholderQR qr(x_matrix); - if (reduced_mode) { - auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); - auto r_matrix_view = - qr_top_matrix.template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } else { - auto r_matrix_view = - qr.matrixQR().template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } - - if (compute_q) { - T* q_matrix_ptr = q_data + i * q_stride; - if (reduced_mode) { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } else { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, m); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } - } - } - } -}; - template class QrGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 955cf8d4448c1b23319fa3e0c10dbd12ae3bf49c..9115d21b195e1b615f43b01af61bbdebd1e70294 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -28,9 +32,17 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); +class ReduceAllOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_all"; } + virtual std::string GetOpType() const { return "Reduce reduce_all"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_all, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAllInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index fa3800dd3c9e46c20df54d748a61166a75be492b..69561b93498883bdf2adcfa3982d24bc1e727be0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -28,9 +31,18 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +class ReduceAnyOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_any"; } + virtual std::string GetOpType() const { return "Reduce reduce_any"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_any, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAnyInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d057ee8f5d798f61c13d5c5c166c9d71b6716d6f..e327d19ab3be8daff08b4e358081d2792fd30835 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -35,7 +35,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(reduce_any); +USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); template diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index 41df8e4a15f093a40a31c70eea98dfb7e575f4cd..15812778e0023e30a29f259bbd14b4c564ea8d46 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMaxInferShapeFunctor); REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu deleted file mode 100644 index 5ee38b8fa46290c86cd44ef1bcc71bd2fcd9bcd4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 4a18330913803f822436118a35fb957b7e31b391..dc41979defb9314f2efb942f0f530c3b5da3bb8b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); - -template -using CPUReduceMeanGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index 11aa78382e319331dc65ec22927f0d5762adfb43..5e5b04d57b002d8e8ecab9ddaf8186118f4bf187 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -14,21 +14,24 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_min); -REGISTER_OP_CPU_KERNEL( - reduce_min, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMinOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_min"; } + virtual std::string GetOpType() const { return "Reduce reduce_min"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_min, ops::ReduceOp, ReduceMinOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMinInferShapeFunctor); +REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu deleted file mode 100644 index bf886063786a8c36884ed20fef41c99468156c01..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index eb745ab9c56c5b3cfa62eb36713ebc2485282d6d..b1abdf9e8a758008dff49176c2d6b6682de5b622 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -26,14 +30,20 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle -REGISTER_REDUCE_OP(reduce_prod); +namespace ops = paddle::operators; + +class ReduceProdOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_prod"; } + virtual std::string GetOpType() const { return "Reduce reduce_prod"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_OP_CPU_KERNEL(reduce_prod_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OPERATOR( + reduce_prod, ops::ReduceOp, ReduceProdOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceProdInferShapeFunctor); +REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu deleted file mode 100644 index 0610cdd94f89c0371988fac7955d07fc5498a69f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index ac0cd75237baf5e8b860f197d42cd27bae65270e..bf78b6a696559cab152a6de2c4730a32dfdbb780 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::NotFound("Input(ROIs) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of ROIAlignOp " - "is not found.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ( - rois_num_dims.size(), 1, - platform::errors::InvalidArgument("The size of RoisNum should be 1" - ", but received size = %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ( - input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " - "But received rank = %d", - input_dims.size())); - PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument( - "The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", - rois_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(rois_dims[1], 4, - platform::errors::InvalidArgument( - "The second dimension " - "of Input(ROIs) should be 4. But received the " - "dimension = %d", - rois_dims[1])); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " - "invalid. The height must be greater than 0. But " - "received 'pooled_height' = %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " - "invalid. The width must be greater than 0. But " - "received 'pooled_width' = %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " - "invalid. The scale must be greater than 0. But " - "received 'spatial_scale' = %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor, + PD_INFER_META(phi::RoiAlignInferMeta)); + REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker, - ops::ROIAlignGradMaker); + ops::ROIAlignGradMaker, + RoiAlignInferShapeFunctor); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align_grad, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel); REGISTER_OP_VERSION(roi_align) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu deleted file mode 100644 index 1a2e64cd45ca401f5fb8ca6b6975a029ba735280..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_align_op.cu +++ /dev/null @@ -1,227 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; -static constexpr int kROISize = 4; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__device__ void BilinearInterpolateGradient(const int height, const int width, - T y, T x, T* w1, T* w2, T* w3, - T* w4, int* x_low, int* x_high, - int* y_low, int* y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return; - } - - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - *y_low = static_cast(y); - *x_low = static_cast(x); - if (*y_low >= height - 1) { - *y_high = *y_low = height - 1; - y = static_cast(*y_low); - } else { - *y_high = *y_low + 1; - } - if (*x_low >= width - 1) { - *x_high = *x_low = width - 1; - x = static_cast(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low, lx = x - *x_low; - T hy = 1. - ly, hx = 1. - lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - - return; -} - -template -__global__ void GPUROIAlignBackward( - const int nthreads, const T* input_rois, const T* out_grad, - const int num_rois, const float spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, - T* input_grad, const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? T(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_input_grad = - input_grad + (roi_batch_ind * channels + c) * height * width; - - const T* offset_out_grad = - out_grad + (n * channels + c) * pooled_height * pooled_width; - const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - const T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - int x_low = -1, x_high = -1, y_low = -1, y_high = -1; - BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4, - &x_low, &x_high, &y_low, &y_high); - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, - diff1); - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, - diff2); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, - diff3); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, - diff4); - } - } - } - } -} - -template -class GPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - auto roi_ptr = - memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int)); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - int bytes = roi_batch_id_list.numel() * sizeof(int); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUROIAlignBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), rois_num, - spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace()), - aligned); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align_grad, - ops::GPUROIAlignGradOpKernel, - ops::GPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h deleted file mode 100644 index 589e35e4ab7ae4caf5efd3fb4d93a26b2ca86b26..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_align_op.h +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -void bilinear_interpolate_gradient(const int height, const int width, T y, T x, - const T out_grad_this_bin, const T count, - T* batch_grad_data) { - int x_low, y_low, x_high, y_high; - T w1, w2, w3, w4; - if (y < -1.0 || y > height || x < -1.0 || x > width) { - w1 = w2 = w3 = w4 = 0; - x_low = x_high = y_low = y_high = -1; - return; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - y_low = static_cast(y); - x_low = static_cast(x); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - *(batch_grad_data + y_low * width + x_low) += diff1; - *(batch_grad_data + y_low * width + x_high) += diff2; - *(batch_grad_data + y_high * width + x_low) += diff3; - *(batch_grad_data + y_high * width + x_high) += diff4; - } -} - -template -class CPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - in_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - - if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) { - return; - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - auto in_stride = phi::stride(in->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - T roi_offset = aligned ? T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - for (int c = 0; c < channels; ++c) { - T* batch_grad_data = - in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1]; - const T* batch_out_grad_data = - out_grad_data + n * out_stride[0] + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - T out_grad_this_bin = batch_out_grad_data[pool_index]; - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - bilinear_interpolate_gradient(height, width, y, x, - out_grad_this_bin, count, - batch_grad_data); - } - } - } - } - } - rois_data += roi_stride[0]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index a512e7dcd682b517f64e3b14e2f35c4c539ec8b4..12e33d56c0020858ba44709572ee8e526bc949df 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -26,74 +29,6 @@ class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "roi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Argmax"), "Output", "Argmax", "roi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The input data should be a four-dimensional " - "tensor with [N,C,H,W], but received input data with " - " %d dimension", - input_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...], but received ROIs is " - "%d-dimensional LoDTensor", - rois_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims[1], kROISize, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...]. But the second dimension of " - "the received data is %d", - rois_dims[1])); - - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::OutOfRange( - "The pooled output height must be greater than 0" - "but received height is %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::OutOfRange( - "The pooled output width must be greater than 0" - "but received width is %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::OutOfRange( - "The spatial scale must be greater than 0, " - "but received spatial scale is %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("Argmax", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -212,20 +147,15 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_pool, RoiPoolInferShapeFunctor, + PD_INFER_META(phi::RoiPoolInferMeta)); + REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, - ops::ROIPoolGradMaker); + ops::ROIPoolGradMaker, + RoiPoolInferShapeFunctor); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - roi_pool, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel); + REGISTER_OP_VERSION(roi_pool) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu deleted file mode 100644 index b907b1114bbc0402fb253ec00610abefe83051c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_pool_op.cu +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - int roi_start_w = round(offset_input_rois[0] * spatial_scale); - int roi_start_h = round(offset_input_rois[1] * spatial_scale); - int roi_end_w = round(offset_input_rois[2] * spatial_scale); - int roi_end_h = round(offset_input_rois[3] * spatial_scale); - - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - - int hstart = static_cast(floor(static_cast(ph) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wstart = static_cast(floor(static_cast(pw) * - static_cast(roi_width) / - static_cast(pooled_width))); - int hend = static_cast(ceil(static_cast(ph + 1) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wend = static_cast(ceil(static_cast(pw + 1) * - static_cast(roi_width) / - static_cast(pooled_width))); - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - T maxval = is_empty ? 0 : -std::numeric_limits::max(); - int maxidx = -1; - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_data_index = h * width + w; - if (offset_input_data[input_data_index] > maxval) { - maxval = offset_input_data[input_data_index]; - maxidx = input_data_index; - } - } - } - output_data[i] = maxval; - if (argmax_data) { - argmax_data[i] = maxidx; - } - } -} - -template -__global__ void GPUROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad, - const int64_t* argmax_data, const int num_rois, const float spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, int* roi_batch_id_data, - T* input_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - int roi_batch_ind = roi_batch_id_data[n]; - int input_offset = (roi_batch_ind * channels + c) * height * width; - int output_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_output_grad = output_grad + output_offset; - T* offset_input_grad = input_grad + input_offset; - const int64_t* offset_argmax_data = argmax_data + output_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - platform::CudaAtomicAdd( - offset_input_grad + argmax, - static_cast(offset_output_grad[ph * pooled_width + pw])); - } - } -} - -template -class GPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - auto in_stride = phi::stride(in_dims); - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - GPUROIPoolForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, roi_id_data, - out->mutable_data(ctx.GetPlace()), - argmax->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisNum"); - auto* argmax = ctx.Input("Argmax"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (x_grad) { - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - x_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, x_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUROIPoolBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), - argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, roi_id_data, - x_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_pool, - ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h deleted file mode 100644 index a104fd49eb3e0b6d842ab6052e1181e6480a6f65..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_pool_op.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -static constexpr int kROISize = 4; - -template -class CPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument("The rois_num from input " - "and lod must be the same.")); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - - const T* rois_data = rois->data(); - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - int roi_start_w = round(rois_data[0] * spatial_scale); - int roi_start_h = round(rois_data[1] * spatial_scale); - int roi_end_w = round(rois_data[2] * spatial_scale); - int roi_end_h = round(rois_data[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); - int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); - - const float bin_size_h = - static_cast(roi_height) / static_cast(pooled_height); - const float bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) - int hstart = - static_cast(floor(static_cast(ph) * bin_size_h)); - int wstart = - static_cast(floor(static_cast(pw) * bin_size_w)); - int hend = - static_cast(ceil(static_cast(ph + 1) * bin_size_h)); - int wend = - static_cast(ceil(static_cast(pw + 1) * bin_size_w)); - - hstart = std::min(std::max(hstart + roi_start_h, 0), height); - hend = std::min(std::max(hend + roi_start_h, 0), height); - wstart = std::min(std::max(wstart + roi_start_w, 0), width); - wend = std::min(std::max(wend + roi_start_w, 0), width); - - const int pool_index = ph * pooled_width + pw; - - // Define an empty pooling region to be zero - bool is_empty = (hend <= hstart) || (wend <= wstart); - output_data[pool_index] = - is_empty ? 0 : -std::numeric_limits::max(); - argmax_data[pool_index] = -1; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; - if (batch_data[index] > output_data[pool_index]) { - output_data[pool_index] = batch_data[index]; - argmax_data[pool_index] = index; - } - } - } - } - } - - batch_data += in_stride[1]; - output_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - // Increment ROI data pointer - rois_data += roi_stride[0]; - } - return; - } -}; - -template -class CPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* argmax = ctx.Input("Argmax"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - - if (in_grad) { - int rois_num = rois->dims()[0]; - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - const int64_t* argmax_data = argmax->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), in_grad, - static_cast(0)); - - auto in_stride = phi::stride(in->dims()); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - int channels = in->dims()[1]; - - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - if (argmax_data[pool_index] >= 0) { - auto index = argmax_data[pool_index]; - batch_grad_data[index] += out_grad_data[pool_index]; - } - } - } - batch_grad_data += in_stride[1]; - out_grad_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - rois_data += roi_stride[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index f82510556fde87fbf4aeb1904e29325358598791..898db4c22fed9cc97baa261b5b512a889290aff3 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/roll_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of RollOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of RollOp should not be null.")); - - auto dims = ctx->Attrs().Get>("axis"); - auto shifts = ctx->Attrs().Get>("shifts"); - - if (!ctx->HasInput("ShiftsTensor")) { - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); - } - } - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor, + PD_INFER_META(phi::RollInferMeta)); + REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, ops::RollGradMaker, - ops::RollGradMaker); + ops::RollGradMaker, + RollInferShapeFunctor); REGISTER_OPERATOR(roll_grad, ops::RollGradOp, ops::RollGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CPU_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); REGISTER_OP_VERSION(roll) .AddCheckpoint( diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu deleted file mode 100644 index b9064c5450f9fbed64bcb65a2f9d15be2b56fbcf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roll_op.cu +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/roll_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/array.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void RollCudaKernel(const T* input, T* output, int64_t N, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t output_idx = idx; - int64_t new_dim_idx = 0; - -#pragma unroll - for (size_t i = 0; i < Rank; i++) { - new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; - if (new_dim_idx >= sizes[i]) { - output_idx += (shifts[i] - sizes[i]) * strides[i]; - } else { - output_idx += shifts[i] * strides[i]; - } - } - output[output_idx] = input[idx]; -} - -template -class RollKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = (shifts[0] % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - - if (size != 0) { - shifts[i] = (shifts[i] % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - -#define CALL_ROLL_CUDA_KERNEL(N) \ - case N: { \ - phi::Array _strides; \ - phi::Array _shifts; \ - phi::Array _sizes; \ - for (size_t idx = 0; idx < N; ++idx) { \ - _strides[idx] = strides[idx]; \ - _shifts[idx] = shifts[idx]; \ - _sizes[idx] = sizes[idx]; \ - } \ - RollCudaKernel< \ - T, \ - N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \ - _shifts, _strides, _sizes); \ - break; \ - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -template -class RollGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input(framework::GradVarName("Out")); - auto* out = context.Output(framework::GradVarName("X")); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = ((-shifts[0]) % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - if (size != 0) { - shifts[i] = ((-shifts[i]) % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CUDA_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h deleted file mode 100644 index 413c7bcfc15eb1cae86c3fedf47ea4f677d1248c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roll_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, - int64_t shift) { - if (dim < 0) { - dim += input_dim.size(); - } - if (input_dim[dim] == 0) { - return; - } - shift = shift % input_dim[dim]; - if (shift < 0) { - shift += input_dim[dim]; - } - - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_dim[i]; - } - auto slice_width = 1; - for (auto i = dim + 1; i < input_dim.size(); i++) { - slice_width *= input_dim[i]; - } - - VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim - << "; dim: " << dim << "; shift: " << shift - << "; outer_loops: " << outer_loops - << "; slice_width: " << slice_width; - if (shift == 0) { - return; - } - - std::vector head; - auto head_size = slice_width * (input_dim[dim] - shift); - head.resize(head_size); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < head_size; j++) { - head[j] = data[i * input_dim[dim] * slice_width + j]; - } - for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { - auto dst_pos = j - input_dim[dim] + shift; - for (auto k = 0; k < slice_width; k++) { - data[(i * input_dim[dim] + dst_pos) * slice_width + k] = - data[(i * input_dim[dim] + j) * slice_width + k]; - } - } - for (auto j = 0; j < head_size; j++) { - data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; - } - } -} - -template -class RollKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar("X"); - auto* output_var = context.OutputVar("Out"); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - PADDLE_ENFORCE_EQ( - dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(axis[%d]) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.", - i, input_dim.size(), input_dim.size() - 1, i, dims[i])); - shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -template -class RollGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar(framework::GradVarName("Out")); - auto* output_var = context.OutputVar(framework::GradVarName("X")); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index d0290795455db1546afbda80e71e79de3f1020ac..3a6fdbaa2613d1f87a84f7175d7d5b507c3479ab 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,60 +23,6 @@ namespace operators { class SearchSortedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - static bool SearchsortedDimsMatchedBeforeLastDim( - const framework::DDim& sequences_dims, - const framework::DDim& values_dims) { - if (sequences_dims.size() != values_dims.size()) { - return false; - } - const auto& sequences_dims_size = sequences_dims.size(); - for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { - if (sequences_dims[dim] != values_dims[dim]) { - return false; - } - } - return true; - } - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence", - "searchsorted"); - OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted"); - - auto sequences_dims = ctx->GetInputDim("SortedSequence"); - auto values_dims = ctx->GetInputDim("Values"); - auto out_int32 = ctx->Attrs().Get("out_int32"); - - if (sequences_dims.size() != 1) { - PADDLE_ENFORCE_EQ( - SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims), - true, - platform::errors::Unavailable( - "The dimensions of sorted_sequence tensor ( %s ) and values " - "tensor ( %s ) can not match. Because the input sorted_sequence " - "tensor must be 1 dimension or the first N-1 dimensions of " - "sorted_sequence tensor and input values tensor must match. " - "Please input appropriate sorted_sequence and values again! ", - sequences_dims, values_dims)); - } - - if (out_int32) { - PADDLE_ENFORCE_LT( - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max(), - platform::errors::Unavailable( - "The size of sorted_sequence %d exceed the maximum limit d%. " - "Because the size of sorted_sequence should be less than the " - "output maximum value for int32 bit. Please set appropriate " - "sorted_sequence to meet this requirement! ", - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max())); - } - - ctx->SetOutputDim("Out", values_dims); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -115,4 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor, + PD_INFER_META(phi::SearchsortedInferMeta)); +REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker, + SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 513ab46e9b5eebdb39faf4401d9d8b2fc387a82f..73655bcb18500e54564936eac4400a0c7b49af62 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -13,9 +13,15 @@ // limitations under the License. #include "paddle/fluid/operators/set_value_op.h" + #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class InferShapeContext; @@ -34,6 +40,8 @@ class CPUDeviceContext; namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class SetValue : public framework::OperatorWithKernel { public: SetValue(const std::string &type, const framework::VariableNameMap &inputs, @@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_LT( - in_dims.size(), 7, - platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", - in_dims.size())); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -236,10 +233,13 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor, + PD_INFER_META(phi::SetValueInferMeta)); + REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueGradMaker, - ops::SetValueOpInplaceInferer); + ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor); REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index e2c8359beb1290f7b1b592c1ff24b15986f41f73..9001ce5d51dece5c6cee481f3f6f92e69c302c2b 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,17 +25,6 @@ class ShapeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of get_shape op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of get_shape op should not be null.")); - auto in_dim = ctx->GetInputDim("Input"); - ctx->SetOutputDim("Out", {in_dim.size()}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -89,7 +81,12 @@ Return the shape of the input. namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(shape, ShapeInferShapeFunctor, + PD_INFER_META(phi::ShapeInferMeta)); + REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ShapeInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 19a395e72314db52d52cf704a567dce8dd58318a..41545a1ca20b267e79f43c2af4c58ea64dd479b2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -760,8 +760,9 @@ static void SoftmaxWithCrossEntropyHardLabel( */ template __global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const LabelT* labels, const int64_t n, - const int64_t dim, const int64_t d, const int ignore_index) { + T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels, + const int64_t n, const int64_t dim, const int64_t d, + const int ignore_index) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = idx / (d * dim); int64_t idx_dim = (idx / d) % dim; @@ -773,10 +774,9 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel( if (lbl == ignore_index) { logits_grad[idx] = static_cast(0.0); } else if (lbl == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; + logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logits_grad[idx] *= loss_grad[ids]; + logits_grad[idx] = softmax[idx] * loss_grad[ids]; } } } @@ -1395,11 +1395,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - if (logit_grad != softmax) { + auto stream = context.cuda_device_context().stream(); + auto ignore_index = context.Attr("ignore_index"); + auto use_softmax = context.Attr("use_softmax"); + + T* logit_grad_data = nullptr; + bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label)); + if (copy_flag) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); + logit_grad_data = logit_grad->template data(); + } else { + logit_grad_data = + logit_grad->template mutable_data(context.GetPlace()); } - T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); @@ -1414,9 +1423,6 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { #else int block = 512; #endif - auto stream = context.cuda_device_context().stream(); - auto ignore_index = context.Attr("ignore_index"); - auto use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax if (!use_softmax) { @@ -1451,11 +1457,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + const T* softmax_data = softmax->template data(); const auto* label_data = labels.template data(); int grid = (n * d + block - 1) / block; SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + logit_grad_data, loss_grad_data, softmax_data, label_data, n, + d / remain, remain, ignore_index); } } }; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc index d198992abde7dc79f0732928a3cb0cb0e6549ded..0c178b02d03099c6e9df4c3eae5ce95352982d7e 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cc +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -50,6 +50,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::SyncBatchNormGradMaker, diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index d1add111e1d24cb711955a9aff06eb19feb35dc9..0a9ae789b01eea8a14952afe0d998005c59c0659 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of topk must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - - if (axis < 0) axis += dim_size; - - int k; - auto k_is_tensor = ctx->HasInput("K"); - if (k_is_tensor) { - k = -1; - } else { - k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_EQ(k >= 1, true, - paddle::platform::errors::InvalidArgument( - "the attribute of k in the topk must >= 1 or be a " - "Tensor, but received %d .", - k)); - } - - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of topk must have >= 1d shape")); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of topk op must have >= %d columns in axis of %d", k, - axis)); - } - - framework::DDim dims = input_dims; - - dims[axis] = k; - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -169,8 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor, + PD_INFER_META(phi::TopKInferMeta)); REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker, - ops::TopkV2GradOpMaker); + ops::TopkV2GradOpMaker, + TopKInferShapeFunctor); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 3e943c62e1ce17857e78e140efeb50e627e80a4e..c8010e8a128e0b2483c93ed38047b17060bfb0e9 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tril_triu_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker, ops::TrilTriuGradOpMaker); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); -REGISTER_OP_CPU_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CPU_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu deleted file mode 100644 index 9cbbdeeb2ce28453f2c22d063975fa82aae5d3b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tril_triu_op.cu +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/tril_triu_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CUDA_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.h b/paddle/fluid/operators/tril_triu_op.h deleted file mode 100644 index 3150b7617d10a8f9c2f60dd2e74ab2cbbb2d655e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tril_triu_op.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuCompute { - public: - HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower, - const int64_t H, const int64_t W, T* out) - : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} - - HOSTDEVICE void operator()(int64_t idx) { - const int64_t row = (idx / W_) % H_; - const int64_t col = idx % W_; - const bool mask = - lower_ ? (col - row > diagonal_) : (col - row < diagonal_); - out_[idx] = mask ? static_cast(0) : in_[idx]; - } - - private: - const T* in_; - const int diagonal_; - const bool lower_; - const int64_t H_; - const int64_t W_; - T* out_; -}; - -template -class TrilTriuOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const auto* x_data = x->data(); - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = x->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(x->numel())); - - paddle::operators::TrilTriuCompute tril_triu_computer( - x_data, diagonal, lower, H, W, out_data); - for_range(tril_triu_computer); - } -}; - -template -class TrilTriuGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* d_out = - context.Input(framework::GradVarName("Out")); - const auto* dout_data = d_out->data(); - auto* d_x = context.Output(framework::GradVarName("X")); - auto* dx_data = d_x->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = d_out->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(d_out->numel())); - - paddle::operators::TrilTriuCompute tril_triu_grad_computer( - dout_data, diagonal, lower, H, W, dx_data); - for_range(tril_triu_grad_computer); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index ad1c1814c05cdf7f96a6f3c05a5cf1a00d2a2e93..4145730357d6007368d26c46d2b1bd47c9085982 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc index e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb..a44ea8ff689b85d9f718572c45b4f8fafaf1565d 100644 --- a/paddle/fluid/operators/tril_triu_op_xpu.cc +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -11,7 +11,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h index 8af6e281424eaabd8d6ea86843b3c13aa36cba47..a6ff2f686cb76bb03de8074014f82d6ff9e57bd3 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.h +++ b/paddle/fluid/operators/truncated_gaussian_random_op.h @@ -1,8 +1,11 @@ /* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -137,9 +140,19 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - TruncatedNormal(T mean, T std) : mean(mean), std(std) {} + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + T operator()(T value) const { - return std::sqrt(2.0) * Erfinv(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 4ed0dd22ec086923bbe47af192cab8d001ae734f..261d9cee2d5cd25c510aacb280b9623f985eb1f7 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -84,13 +84,8 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { Tensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 984d9f397cc655b4cfd7e0bc211db1665252272f..803b61fbe813f85f48b71d1de7fc41eb26e4b8da 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -32,13 +32,8 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index ce2dba4db02a00398965f0ed656fa64e4fc828df..4001fd744e67784d736cb157743e4b4d7fa4517e 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -559,6 +559,34 @@ inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT } } +inline void RegisterModelLayout( + std::vector>& ops, + const platform::Place& place) { + if (platform::is_cpu_place(place)) { + auto check_attrib = [](std::unique_ptr& op, + const std::string& attrib_name) -> bool { + if (op->HasAttr(attrib_name)) { + auto data_format = op->Attr(attrib_name); + platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( + data_format.compare("NHWC") == 0 ? framework::DataLayout::kNHWC + : framework::DataLayout::kNCHW); + return true; + } else { + return false; + } + }; + + for (auto& op : ops) { + if (check_attrib(op, std::string("data_format"))) { + return; + } + if (check_attrib(op, std::string("data_layout"))) { + return; + } + } + } +} + inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) { return (op->GetAttrIfExists("mkldnn_data_type") == "int8" || op->GetAttrIfExists("use_quantizer")); diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 1df917b8c3594d4505d9e92cd9a8c64bffd50279..e89d8d96342e723724bb867a14bc4262c6ab7b16 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -235,25 +235,13 @@ void BindDistributed(py::module *m) { py::call_guard()); #if defined(PADDLE_WITH_GLOO) - py::class_(*m, "GlooOptions") - .def(py::init<>()) - .def_readwrite("_device", &GlooOptions::device) - .def_static("create", &GlooOptions::create); - - py::class_>(*m, "GlooStore") - .def(py::init( - [](const std::shared_ptr &store) { - return std::make_shared(store); - }), - py::call_guard()); - py::class_>( *m, "ProcessGroupGloo", ProcessGroup) - .def(py::init &, int, int, - std::shared_ptr &>(), + .def(py::init &, int, + int, std::shared_ptr &>(), py::call_guard()) - .def(py::init([](const std::shared_ptr &store, int rank, - int world_size) { + .def(py::init([](const std::shared_ptr &store, + int rank, int world_size) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); if (ifname && strlen(ifname) > 1) { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index e110432c67d395c865d934a47eaa4a803053db8b..528bd75eb0013b95057d7549e083b2fa1318cac1 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, EAGER_TRY auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); - egr::RunBackward(tensors, grad_tensors, - CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + egr::Backward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); + auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); + auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2); + auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); + auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5); + auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6); + auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7); + + std::vector result = + egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars); + VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad"; + return ToPyObject(result, true /* return_py_none_if_not_initialize */); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -355,6 +375,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ins_auto_grad_metas.resize(ctx.InputRange().size()); VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); outs_auto_grad_metas.resize(ctx.OutputRange().size()); + for (size_t i = 0; i < ctx.InputRange().size(); i++) { ins_auto_grad_metas[i] = egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( @@ -384,11 +405,15 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, // Prepare Grad outputs size_t no_grad_cnt = 0; for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); + if (slot_map[0].find(i) != slot_map[0].end()) { - grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], slot_map[0][i]); + grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); } else { - grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], + grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); grad_node->AddEdges(&ins_auto_grad_metas[i], ins_auto_grad_metas.size() - 1 - no_grad_cnt); @@ -397,11 +422,14 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with grad of fwd outputs for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); - grad_node->SetGradInMeta(&(outs_auto_grad_metas[i]), i); - egr::EagerUtils::CheckAndRetainGrad(ctx.OutputsBetweeen( - ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); } // Prepare Grad inputs with fwd outputs @@ -452,6 +480,9 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_partial_grad", + (PyCFunction)(void (*)(void))eager_api_run_partial_grad, + METH_VARARGS | METH_KEYWORDS, NULL}, {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 082ec382c79cd9c98ac75db14bc552883088b885..49745e5679d9af3c8cf07ba0cac679217a691052 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -36,6 +36,8 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" namespace paddle { namespace pybind { @@ -226,6 +228,19 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto cp_tensor = + self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true); + egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); + egr::EagerUtils::autograd_meta(&cp_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(cp_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_reconstruct_from_(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -264,7 +279,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); } - self->tensor.copy_(src_tensor, blocking); + self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking); VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); @@ -314,23 +329,25 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, grad = meta->MutableGrad(); } - if (grad->is_selected_rows()) { - auto selected_rows = - std::dynamic_pointer_cast(grad->impl()); - if (selected_rows->mutable_value()->IsInitialized()) { - selected_rows->mutable_rows()->clear(); - selected_rows->mutable_value()->clear(); - } - } else if (grad->is_dense_tensor()) { - if (grad->initialized()) { - if (set_to_zero) { - grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); - } else { - VLOG(4) << "Gradient of " << self->tensor.name() - << " is initialized, will be released."; - auto dense_tensor = - std::dynamic_pointer_cast(grad->impl()); - dense_tensor->MoveMemoryHolder(); + if (grad->impl()) { + if (grad->is_selected_rows()) { + auto selected_rows = + std::dynamic_pointer_cast(grad->impl()); + if (selected_rows->mutable_value()->IsInitialized()) { + selected_rows->mutable_rows()->clear(); + selected_rows->mutable_value()->clear(); + } + } else if (grad->is_dense_tensor()) { + if (grad->initialized()) { + if (set_to_zero) { + grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + } else { + VLOG(4) << "Gradient of " << self->tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad->impl()); + dense_tensor->MoveMemoryHolder(); + } } } } @@ -703,6 +720,107 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_coo_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCooTensor")); + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_coo_tensor->non_zero_indices())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_elements(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE( + self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal("this method is only effective for " + "SparseCooTensor or SparseCsrTensor")); + if (self->tensor.is_sparse_coo_tensor()) { + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_coo_tensor->non_zero_elements())); + return ToPyObject(tensor); + } else { + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_csr_tensor->non_zero_elements())); + return ToPyObject(tensor); + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_crows(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCsrTensor")); + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor( + std::make_shared(sparse_csr_tensor->non_zero_crows())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_cols(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCsrTensor")); + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor( + std::make_shared(sparse_csr_tensor->non_zero_cols())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse_coo(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_coo_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_csr_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + uint32_t inplace_version = self->tensor.current_inplace_version(); + + return ToPyObject(inplace_version); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -751,6 +869,28 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, METH_VARARGS | METH_KEYWORDS, NULL}, + /***the method of sparse tensor****/ + {"non_zero_indices", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_elements", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_elements, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_crows", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_crows, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_cols", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_cols, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse", (PyCFunction)(void (*)(void))tensor_method_is_sparse, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_is_sparse_coo, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr, + METH_VARARGS | METH_KEYWORDS, NULL}, + /***the method of sparse tensor****/ + {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 102cdbb91ab066c4a6d499688bca30c1c3d185ad..685e20aef2591492340d228f0a48d7a426ddb889 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -162,17 +162,22 @@ static inline std::string TempName(const std::string& name) { std::string GenerateOpFunctionsBody( const paddle::framework::proto::OpProto* op_proto, std::string func_name, - bool use_inplace_strategy = false, std::map inplace_map = {}) { auto& op_type = op_proto->type(); std::string input_args = ""; - std::string call_api_str = "auto out = " + op_type + "_dygraph_function("; + std::string call_api_str = ""; std::string ins_initializer_with_null = ""; std::string py_arg = ""; int arg_idx = 0; int input_args_num = 0; std::string ins_cast_str = ""; std::string view_strategy_str = ""; + if (!inplace_map.empty()) { + // change call_api_str for inplace op + call_api_str = "auto out = " + op_type + "__dygraph_function("; + } else { + call_api_str = "auto out = " + op_type + "_dygraph_function("; + } for (auto& input : op_proto->inputs()) { auto& in_name = input.name(); // skip those dispensable inputs, like ResidualData in conv2d @@ -288,8 +293,31 @@ std::string GenerateOpFunctionsBody( HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, viwe_input_name, viwe_output_name); } - - return_str = "return ToPyObject(out);"; + if (!inplace_map.empty()) { + // For inplace op, Use the input PyObject directly. + for (auto& inplace_pair : inplace_map) { + // Find index of inplace tensor, and directly use input PyObject. + std::string inplace_arg_name = inplace_pair.second; + std::string inplace_return_name = inplace_pair.first; + const char* RETURN_INPLACE_TENSOR_TEMPLATE = + "ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, " + "\"%s\", \"%s\");\n" + " ssize_t return_id = " + "GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n" + " return ToPyObject(out, return_id, args, arg_id);"; + return_str = paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE, + op_type, inplace_arg_name, op_type, + inplace_return_name); + // only support one inplace_var in temporary. + PADDLE_ENFORCE_EQ( + inplace_map.size(), 1, + paddle::platform::errors::InvalidArgument( + "size of inplace_map must be 1, but got %d", inplace_map.size())); + break; + } + } else { + return_str = "return ToPyObject(out);"; + } std::string function_args = ""; if (input_args == "") { @@ -383,7 +411,8 @@ GenerateOpFunctions() { continue; } std::string func_name = "eager_api_" + op_type; - std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name); + std::string op_function_str = + GenerateOpFunctionsBody(op_proto, func_name, {}); // generate pybind item auto bind_function_str = paddle::string::Sprintf( @@ -391,6 +420,40 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(op_function_str)); bind_function_list.emplace_back(std::move(bind_function_str)); + + // NOTE(pangyoki): Inplace Strategy. + // In this case, output will reuse input varbase. + // Dygraph mode needs to be aligned with the in-place strategy in static + // mode, and the mapping relationships between output and input that have + // been defined in static mode should be used in dygraph mode. + // Find which ops need to use Inplace strategy in static mode, and get the + // mapping relationship between Inplace output and input. + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + // Inplace OP: op_type_. + // The inplace OP needs a new implementation method. + auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + std::string inplace_op_type = op_type + "_"; + std::string inplace_func_name = "eager_api_" + inplace_op_type; + std::string inplace_op_function_str = + GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map); + + // generate pybind item + auto inplace_bind_function_str = + paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type, + inplace_func_name, inplace_op_type); + + op_function_list.emplace_back(std::move(inplace_op_function_str)); + bind_function_list.emplace_back(std::move(inplace_bind_function_str)); + } } if (append_custom_head_file) { op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2572866b8f5198b2414163d4198e06b54d11fedc..ff8980d727e70a41223878f22f019353f8b71972 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -96,7 +96,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, "Detected NULL grad" "Please check if you have manually cleared" "the grad inside autograd_meta")); - grad->copy_(src, true); + grad->copy_(src, self->tensor.inner_place(), true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 217edad0c0a105cc649c6c8c4433b0c8eab0119b..a23bb1230e128657e0bd416d7e1875997e6cf6e8 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -417,6 +417,8 @@ PyObject* ToPyObject(bool value) { PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } +PyObject* ToPyObject(uint32_t value) { return PyLong_FromUnsignedLong(value); } + PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); } @@ -442,6 +444,20 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value) { return obj; } +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // value: Useless parameter. + // value_idx: Useless parameter. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + Py_INCREF(obj); + return obj; +} + PyObject* ToPyObject(const std::vector& value) { PyObject* result = PyList_New((Py_ssize_t)value.size()); @@ -492,20 +508,26 @@ PyObject* ToPyObject(const std::vector& value) { return result; } -PyObject* ToPyObject(const std::vector& value) { +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize) { PyObject* result = PyList_New((Py_ssize_t)value.size()); for (size_t i = 0; i < value.size(); i++) { - PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); - if (obj) { - auto v = reinterpret_cast(obj); - new (&(v->tensor)) paddle::experimental::Tensor(); - v->tensor = value[i]; + if (!value[i].initialized() && return_py_none_if_not_initialize) { + Py_INCREF(Py_None); + PyList_SET_ITEM(result, static_cast(i), Py_None); } else { - PADDLE_THROW(platform::errors::Fatal( - "tp_alloc return null, can not new a PyObject.")); + PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->tensor)) paddle::experimental::Tensor(); + v->tensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); } - PyList_SET_ITEM(result, static_cast(i), obj); } return result; diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 2187555e1c3c7f64bd864e4212bfc6ebe1fb1684..fba1485bcf44ea70db286225fbbe3c70caceb4bd 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -56,6 +56,7 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); PyObject* ToPyObject(int value); +PyObject* ToPyObject(uint32_t value); PyObject* ToPyObject(bool value); PyObject* ToPyObject(int64_t value); PyObject* ToPyObject(float value); @@ -63,12 +64,15 @@ PyObject* ToPyObject(double value); PyObject* ToPyObject(const char* value); PyObject* ToPyObject(const std::string& value); PyObject* ToPyObject(const paddle::experimental::Tensor& value); +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); -PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize = false); PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const framework::LoDTensor* value); PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype); @@ -83,6 +87,17 @@ struct TupleTensorResult { TupleTensorResult::Run(out, result); PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + TupleTensorResult::Run(out, result, value_idx, args, arg_idx); + if (N - 1 == value_idx) { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out), + value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); + } + } }; template @@ -90,6 +105,16 @@ struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + if (value_idx == 0) { + PyTuple_SET_ITEM(result, 0, + ToPyObject(std::get<0>(out), value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); + } + } }; template @@ -102,6 +127,26 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +template +PyObject* ToPyObject(const std::tuple& out, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // out: Outputs tuple after executing op. + // value_idx: Index of inplace tensor in outputs tuple. Used to find the + // output inplace tensor. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. + auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleTensorResult::Run(out, result, value_idx, + args, arg_idx); + + return result; +} + paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 3145a9cf7655c053c269990e00982226eae49c7a..01dae420cc6ab84edc0b0df11b0b4cf6408a87f7 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -225,7 +225,7 @@ void BindGraphPyClient(py::module* m) { .def("stop_server", &GraphPyClient::stop_server) .def("get_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names) { auto feats = self.get_node_feat(node_type, node_ids, feature_names); @@ -239,7 +239,7 @@ void BindGraphPyClient(py::module* m) { }) .def("set_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names, std::vector> bytes_feats) { std::vector> feats(bytes_feats.size()); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 9b373a58181f165b52fe809b817977a076b7d961..3a2c93309f34454ae0ce2d3419e3fce474f7c06b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -52,11 +52,13 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/compat/arg_map_context.h" +#include "paddle/phi/core/compat/type_defs.h" namespace paddle { namespace pybind { @@ -117,7 +119,11 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { return var; } - return PyObjectCast>(res)->SharedVar(); + auto res_varbase = PyObjectCast>(res); + // Here the reference count of `res` is 2, so we decreases the reference + // count manually to avoid memory leaks + Py_DECREF(res); + return res_varbase->SharedVar(); } private: @@ -436,6 +442,28 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( return result; } +paddle::imperative::NameTensorMap ConvertToNameTensorMap( + const PyNameVarBaseMap &map) { + paddle::imperative::NameTensorMap result; + for (auto &pair : map) { + auto var_vec = CastPyArg2VectorOfTensor(pair.second.ptr(), 0); + if (!var_vec.empty()) { + // change vector -> vector> + std::vector> dst_var_vec; + for (auto &v : var_vec) { + dst_var_vec.emplace_back( + std::make_shared(std::move(v))); + } + result.emplace(pair.first, std::move(dst_var_vec)); + } + } + + PADDLE_ENFORCE_EQ( + PyErr_Occurred(), nullptr, + platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); + return result; +} + template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT @@ -2079,8 +2107,8 @@ void BindImperative(py::module *m_ptr) { const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs) { // TODO(xiongkun): move this function outside of tracer. - auto ins_map = ConvertToNameVarBaseMap(ins); - auto outs_map = ConvertToNameVarBaseMap(outs); + auto ins_map = ConvertToNameTensorMap(ins); + auto outs_map = ConvertToNameTensorMap(outs); { auto to_vector = [](paddle::SmallVector &vec) { return std::vector(vec.begin(), vec.end()); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b008308e27d9afaa9d8c47290489d50a762f2a41..c8f0acd0b8a853f541a6fb8cbafe73f27688c71a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) { .def("params_file", &AnalysisConfig::params_file) .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16, + py::arg("gpu_fp16_disabled_op_types") = + std::unordered_set({})) .def("enable_xpu", &AnalysisConfig::EnableXpu, py::arg("l3_workspace_size") = 16 * 1024 * 1024, py::arg("locked") = false, py::arg("autotune") = true, diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index bb45c1c40603f953c70f0e63b6e762037312e8c3..ecbacd37d5666b85d5ddaef595d106e2400b055c 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -143,6 +143,7 @@ void BindNode(py::module *m) { .def("var", &Node::Var, return_value_policy::reference) .def("op", &Node::Op, return_value_policy::reference) .def("id", &Node::id) + .def("graph_id", &Node::GraphId) .def("original_desc_id", &Node::OriginalDescId) .def("is_op", &Node::IsOp) .def("is_var", &Node::IsVar) diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8d78adaf5a4735d87e2206df6c8b55875db68118..1520174fba288b4ecf683e79c36b6c0228237b2e 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -46,10 +46,19 @@ int main(int argc, char **argv) { auto &kernel_factory = phi::KernelFactory::Instance(); std::string kernel_signature_map_str{"{"}; for (const auto &op_kernel_pair : kernel_factory.kernels()) { - if (kernel_signature_map.Has(op_kernel_pair.first)) { + std::string op_name = op_kernel_pair.first; + const paddle::flat_hash_map &kernel_name_map = + phi::OpUtilsMap::Instance().base_kernel_name_map(); + for (auto &it : kernel_name_map) { + if (it.second == op_name) { + op_name = it.first; + break; + } + } + if (kernel_signature_map.Has(op_name)) { kernel_signature_map_str = kernel_signature_map_str + "\"" + op_kernel_pair.first + "\":{"; - auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + auto &args = kernel_signature_map.Get(op_name).args; kernel_signature_map_str += "\"inputs\":["; auto inputs_ = std::get<0>(args); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 09c3cea398b2aec4d7cf0953ffb0aed75de37601..1d483abd7746c104c3f1dcf318f45850e4fcb855 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -854,5 +854,30 @@ void InitOpsAttrTypeMap() { } } +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name) { + // `core_ops_info_map` can be `core_ops_args_info` or `core_ops_returns_info`. + // `core_ops_args_info`: get index from core_ops_args_info[op_type] according + // to input name. + // `core_ops_returns_info`: get index from core_ops_returns_info[op_type] + // according to return name. + if (!core_ops_info_map.count(op_type)) { + PADDLE_THROW(platform::errors::Fatal( + "Op %s is not found in core_ops_*_info map.", op_type)); + } else { + auto args_list = core_ops_info_map.at(op_type); + auto it = std::find(args_list.begin(), args_list.end(), name); + if (it == args_list.end()) { + PADDLE_THROW(platform::errors::Fatal("%s is not found in op %s's args.", + name, op_type)); + } else { + return std::distance(args_list.begin(), it); + } + } + return -1; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 7ead9852667252d189b1fcdecc6b4ac7b86d785f..33d0e242a027d250904a21ca36a39b6a639178e1 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -146,5 +146,10 @@ unsigned long GetUnsignedLongFromArgs( // NOLINT void InitOpsAttrTypeMap(); +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name); + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 9e86e3df8a6884ec1b75b8525ad858ff8f2e233c..d8750c1d6c115a6de8a493cac4ccadbd47bc10fd 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -88,6 +88,7 @@ std::map> op_ins_map = { {"nce", {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", "CustomDistAlias", "CustomDistAliasProbs"}}, + {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 21bbc7f3e369bf66935487d3f3619c9a0890399b..ed42d0792eafbc8661883a7e8d5b396fac14686f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -114,6 +114,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" @@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) { // stored in this static instance to avoid illegal memory access. m.def("clear_kernel_factory", []() { phi::KernelFactory::Instance().kernels().clear(); }); + m.def("clear_device_manager", []() { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::Clear(); +#endif + }); // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. So we add this API diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 4e273f6d551edd74ec979e6ec34aedabdb58bd10..e777a8e3ab4e6a59662ce7b4eb9a31a7409d6f56 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -3,12 +3,22 @@ if (NOT WITH_INFRT) endif() option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) +option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF) +option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) if (INFRT_WITH_PHI) - add_definitions("-DINFRT_WITH_PHI") + add_definitions("-DINFRT_WITH_PHI") + + # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later. + if (INFRT_WITH_GPU) + add_definitions("-DINFRT_WITH_GPU") + if (INFRT_WITH_TRT) + add_definitions("-DINFRT_WITH_TRT") + endif() + endif() endif() # compile flags @@ -92,7 +102,6 @@ set(infrt_mlir_incs test_kernels_inc tensor_shape_inc dense_tensor_inc - pd_ops_inc pd_extra_ops_inc trt_ops_inc ) @@ -106,6 +115,9 @@ if (INFRT_WITH_PHI) endif() cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +if (INFRT_WITH_TRT) + target_link_libraries(infrt infrt_trt) +endif() cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h index c8f97e04a1b8376efbac749fffa70d77c7b95e72..6e3bef9299162d493825f49e3962c75f2845e2d0 100644 --- a/paddle/infrt/backends/host/phi_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -13,6 +13,10 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" +#ifdef INFRT_WITH_GPU +#include +#endif + namespace infrt { namespace backends { @@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator { } }; +#ifdef INFRT_WITH_GPU +// TODO(wilber): Just for demo test. we need a more efficient gpu allocator. +class GpuPhiAllocator : public phi::Allocator { + public: + static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); } + + AllocationPtr Allocate(size_t bytes_size) { + void* ptr; + cudaMalloc(&ptr, bytes_size); + return AllocationPtr( + new phi::Allocation( + ptr, bytes_size, phi::Place(phi::AllocationType::GPU)), + deleter); + } +}; +#endif + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h index 5713fdbbaf82b2ea2190d2ee1b1dc5d944f2c262..bcd63dbb39fe8c52499138423bc9b86fa5de9d57 100644 --- a/paddle/infrt/backends/host/phi_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace infrt { namespace backends { @@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext { std::unique_ptr alloc_{std::make_unique()}; }; +class GpuPhiContext : public phi::GPUContext { + public: + using Base = phi::GPUContext; + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; +}; + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 12cf14060e27c1d58e3fd9b14cc12b3c1f7f8907..89dd3b0dc7abf48102b48f16fb974b3c902fe049 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -37,9 +37,9 @@ namespace infrt { namespace backends { namespace tensorrt { -const char* model_input = "model_input"; -const char* model_output = "model_output1"; -const char* model_output2 = "model_output2"; +const char* model_input = "input_0"; +const char* model_output = "output_0"; +const char* model_output2 = "output_1"; TrtUniquePtr ConstructNetwork( nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { @@ -82,9 +82,176 @@ TrtUniquePtr ConstructNetwork( return network; } +TrtUniquePtr ConstructFCNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 7840; + std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i % 255 * 0.02f; + } + kernel_weights.values = weight_data.data(); + auto* layer = network->addFullyConnected( + *data, 10, kernel_weights, nvinfer1::Weights{}); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + +TrtUniquePtr ConstructConvNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights, bias_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + bias_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 81; + bias_weights.count = 3; + std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i * 0.02f; + } + std::vector bias_data(bias_weights.count); + for (size_t i = 0; i < bias_data.size(); ++i) { + bias_data[i] = i * 0.5f; + } + kernel_weights.values = weight_data.data(); + bias_weights.values = bias_data.data(); + nvinfer1::Dims ksize; + ksize.nbDims = 2; + ksize.d[0] = 3; + ksize.d[1] = 3; + auto* layer = + network->addConvolutionNd(*data, 3, ksize, kernel_weights, bias_weights); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + // sigmoid(x) = 1 / (1 + exp(-x)) inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } +TEST(trt, run_fc_static) { + TrtEngine engine(0); + auto net = ConstructFCNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{1, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 1, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 1 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + cudaStreamSynchronize(context.stream()); +} + +TEST(trt, run_conv_static) { + TrtEngine engine(0); + auto net = ConstructConvNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + cudaStreamSynchronize(context.stream()); +} + TEST(trt, run_static) { TrtEngine static_trt_engine(0); auto net = ConstructNetwork( @@ -122,27 +289,26 @@ TEST(trt, run_static) { std::unordered_map inputs; inputs.emplace(std::make_pair(model_input, &input)); - phi::DenseTensor output, output2; - std::unordered_map outputs; - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.PrepareOutputHandle("output_0"); + static_trt_engine.PrepareOutputHandle("output_1"); + static_trt_engine.SetUpInference(inference_options, inputs); static_trt_engine.GetEngineInfo(); static_trt_engine.Run(context); + phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0"); + phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); @@ -208,27 +374,27 @@ TEST(trt, run_dynamic) { context.stream()); std::unordered_map inputs; - std::unordered_map outputs; inputs.emplace(std::make_pair(model_input, &input)); - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - engine.SetUpInference(inference_options, inputs, &outputs); + engine.PrepareOutputHandle("output_0"); + engine.PrepareOutputHandle("output_1"); + engine.SetUpInference(inference_options, inputs); engine.GetEngineInfo(); engine.Run(context); + phi::DenseTensor* output0 = engine.GetOutput("output_0"); + phi::DenseTensor* output1 = engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index 232653e8c41f71fd9bb32c9eac302b047d122b66..43d356b6d6983afdca220029d34d9d5cd27da009 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -21,6 +21,7 @@ #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace backends { @@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, return true; } +void TrtEngine::PrepareOutputHandle(const std::string& out_name) { + phi::DenseTensor t; + outputs_.emplace(out_name, t); +} + +phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { + return &outputs_[name]; +} + +size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } + bool TrtEngine::SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs) { + const std::unordered_map& inputs) { // TODO(wilber): now only create one exec_context FreshDeviceId(); CHECK(engine_ != nullptr); @@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference( bindings_.front()->AddBinding( bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT); } - for (auto& it : *outputs) { + for (auto& it : outputs_) { const int bind_index = engine_->getBindingIndex(it.first.c_str()); bindings_.front()->AddBinding( - bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT); } return true; @@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { const int bind_index = engine_->getBindingIndex(bind.name.c_str()); std::vector ddim; auto dims = engine_->getBindingDimensions(bind_index); + CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1."; ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; ++i) { ddim.push_back(dims.d[i]); } bind.buffer->Resize(phi::make_ddim(ddim)); + // TODO(wilber): now only support float output. ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); buffers[bind_index] = static_cast(bind.buffer->data()); } diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 3c8243e3c3838e30eb70877f8c82d623c103eaff..a26474f8cbb357d42cd6d951829bbdc24a256640 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -81,11 +81,17 @@ class TrtEngine { // TODO(wilber): How to support multiple execution contexts? bool SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs); + const std::unordered_map& inputs); void GetEngineInfo(); + void PrepareOutputHandle(const std::string& out_name); + + // TODO(wilber): The output tensor names are: output_0, output_1, ... + phi::DenseTensor* GetOutput(const std::string&); + + size_t GetOutputNum() const; + private: void FreshDeviceId(); @@ -112,6 +118,7 @@ class TrtEngine { std::vector> bindings_; int device_id_{0}; bool is_dynamic_shape_{false}; + std::unordered_map outputs_; }; } // namespace tensorrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index a3f2d0afafc417cc7a4cbba8a3d6bfa92c9bef00..cf3906c32e559d9fa33d0583be9adb1b2591e78b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -7,16 +7,10 @@ gather_srcs(infrt_src SRCS dense_tensor.cc mlir_loader.cc diagnostic_utils.cc - pd_ops.cc ) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) -mlir_tablegen_on(pd_op_base DIALECT pd) -mlir_tablegen_on(pd_ops) -mlir_tablegen_on(pd_extra_ops) - -mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) @@ -24,10 +18,10 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) -add_dependencies(print-ir pd_ops_inc) cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) add_subdirectory(infrt) +add_subdirectory(pd) add_subdirectory(tensorrt) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 666c7b300af33db0c27e5b3ab8a74aa4b1591c9b..59df4e9697370e9d8db4bbc0a5d69e8ef03950a5 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -130,7 +130,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { } def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { - let summary = "ddt.tensor_map_get_size operation"; + let summary = "dt.tensor_map_get_size operation"; let description = [{ An operation that get the size of a TensorMap. @@ -141,6 +141,32 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; } +def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> { + let summary = "dt.tensor_list_get_tensor operation"; + + let description = [{ + An operation that can get a tensor from a TensorList. + }]; + + let arguments = (ins + DenseTensorList:$l, + I32Attr:$id + ); + let results = (outs DenseTensor:$output); + let verifier = ?; +} + +def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> { + let summary = "dt.tensor_list_get_size operation"; + + let description = [{ + An operation that get the size of a TensorList. + }]; + + let arguments = (ins DenseTensorList:$map); + let results = (outs I32:$size); +} + def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { let summary = "dt.get_tensor_shape operation"; diff --git a/paddle/infrt/dialect/infrt/common/types.cc b/paddle/infrt/dialect/infrt/common/types.cc index 62419a196288bb052a9f240ecc25f34c102a5b35..c10679b01342f03b35e816bf290f71790f541ee2 100644 --- a/paddle/infrt/dialect/infrt/common/types.cc +++ b/paddle/infrt/dialect/infrt/common/types.cc @@ -30,6 +30,8 @@ llvm::Optional GetLayoutType(llvm::StringRef key) { return LayoutType::NCHW; else if (key.equals_insensitive("NHWC")) return LayoutType::NHWC; + else if (key.equals_insensitive("ANY")) + return LayoutType::ANY; else return llvm::None; } @@ -39,6 +41,8 @@ llvm::Optional GetPrecisionType(llvm::StringRef key) { return PrecisionType::FLOAT32; else if (key.equals_insensitive("FP16")) return PrecisionType::FLOAT16; + else if (key.equals_insensitive("UNK")) + return PrecisionType::UNK; else return llvm::None; } @@ -67,6 +71,9 @@ llvm::StringRef GetString(LayoutType type) { case (LayoutType::NHWC): str = "NHWC"; break; + case (LayoutType::ANY): + str = "ANY"; + break; default: str = "Unsupported"; } @@ -82,6 +89,9 @@ llvm::StringRef GetString(PrecisionType type) { case (PrecisionType::FLOAT16): str = "FP16"; break; + case (PrecisionType::UNK): + str = "UNK"; + break; default: str = "Unsupported"; } diff --git a/paddle/infrt/dialect/infrt/ir/infrt_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td index c5130e89bb13a58a0aa0cf3aeae1b00e269eb259..86cfc375330b19878528645a2e810efb797e153f 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -89,6 +89,13 @@ def DenseTensorMap : Infrt_Type<"DenseTensorMap"> { let parameters = (ins); } +// TODO(wilber): Add !infrt.vec type. +def DenseTensorList : Infrt_Type<"DenseTensorList"> { + let summary = "infrt dense tensor map"; + let description = [{dense_tensor map}]; + let parameters = (ins); +} + // Type Constrait for concrete DenseTensor type. class DenseTensor : Type, diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 42de08ebc41938c40675435d4af10f758c52052b..f8d8f514749f802299600acac60b12de70a8d3fe 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -90,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } if (keyword == "dense_tensor") { // parse DenseTensor, for example: !i=Infrt.tensor llvm::StringRef target; @@ -134,6 +137,11 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return DenseTensorType::get( parser.getContext(), *targetType, *precisionType, *layoutType); } + + if (keyword == "tensor_list") { + return infrt::DenseTensorListType::get(parser.getContext()); + } + // Todo: parse other type return mlir::Type(); } @@ -154,9 +162,13 @@ void InfrtDialect::printType(::mlir::Type type, << lod_tensor_type.getLod_level() << ">"; return; } + if (type.isa()) { + os << "dense_tensor_map"; + return; + } // print DenseTensorType, for example: !infrt.dense_tensor - if (type.isa()) { + if (type.isa()) { auto dense_tensor_type = type.cast(); os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " << dense_tensor_type.getPrecision() << ", " @@ -164,6 +176,16 @@ void InfrtDialect::printType(::mlir::Type type, return; } + if (type.isa()) { + os << "tensor_list"; + return; + } + // print DenseTensorType, for example: !infrt.dense_tensor + if (type.isa()) { + os << "dense_tensor_map"; + return; + } + llvm_unreachable("unknown infrt type."); } diff --git a/paddle/infrt/dialect/infrt/ir/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td index f5430b03d0d75cfa8ba91f03ebc90ee0f73c25d7..82eba2a1746cce31e3fe99ae71c782bb88524930 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td @@ -53,9 +53,9 @@ def Infrt_CallOp : Infrt_Op<"call"> { }]; } -def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { - let summary = "convert tensor type op"; - let description = [{convert tensor type op!}]; +def Infrt_TensorCastOp : Infrt_Op<"tensor_cast", [NoSideEffect]> { + let summary = "cast tensor type op"; + let description = [{cast tensor type op!}]; let arguments = (ins AnyType:$input); let results = (outs AnyType:$output); } diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index 51addb4deb43824965806962613d1ab4bd1c1e3d..3d825a9c762f4833e577125d20423a5f1d41737f 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -3,19 +3,19 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" -def FuseCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)), - (Infrt_CvtTensorOp $arg)>; +def FuseTensorCastPattern : Pat< + (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)), + (Infrt_TensorCastOp $arg)>; -def FuseFeedCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (PD_FeedOp $name)), +def FuseFeedTensorCastPattern : Pat< + (Infrt_TensorCastOp (PD_FeedOp $name)), (PD_FeedOp $name)>; def TypesAreIdentical : Constraint>; -def RedundantCvtTensorOptPattern : Pat< - (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg), +def RedundantTensorCastOptPattern : Pat< + (Infrt_TensorCastOp:$res $arg), (replaceWithValue $arg), [(TypesAreIdentical $res, $arg)]>; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 25ecf2ae99dc3613944fcedaee427b540f0faae4..eec0e0bc7c5ab624e9db7744c357b58ff5107eef 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -16,7 +16,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT @@ -27,8 +27,12 @@ struct InfrtOpFusePass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "infrtOpFusePass"; } + + llvm::StringRef getArgument() const override { return "infrt-op-fuse"; } + void runOnFunction() override; }; + // Implementation of the InfrtOpFusePass. void InfrtOpFusePass::runOnFunction() { ::mlir::RewritePatternSet patterns(&getContext()); @@ -39,14 +43,18 @@ void InfrtOpFusePass::runOnFunction() { if (nullptr == terminator_op) return; for (auto operand : terminator_op->getOperands()) { auto *op1 = operand.getDefiningOp(); - auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1); + auto cvt_op = ::llvm::dyn_cast<::infrt::TensorCastOp>(op1); if (!cvt_op) continue; mlir::Value value = cvt_op.input(); operand.replaceAllUsesWith(value); cvt_op.erase(); } } + } // namespace + std::unique_ptr infrt::createInfrtOpFusePass() { return std::make_unique(); } + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 0c5944ebf84750be8cf789552219157da3170c39..56c375c72d2bbb24e1a279c6f160e0ea7a98bd83 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -20,20 +20,23 @@ #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" #include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT registry.insert { let summary = "Computes the Fully Connected result of two tensors"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td similarity index 85% rename from paddle/infrt/dialect/pd_op_base.td rename to paddle/infrt/dialect/pd/ir/pd_op_base.td index f6af4c83aed8bd0b7ce04c172169b036e674777b..e28854a848023c1161c8cda24edb705f536b5698 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td @@ -8,7 +8,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -def PD_Dialect : Dialect { +def Paddle_Dialect : Dialect { let name = "pd"; let description = [{ @@ -16,16 +16,16 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. }]; - - let cppNamespace = "mlir::pd"; + let hasConstantMaterializer = 1; + let cppNamespace = "infrt::pd"; } class PD_Op traits = []> : - Op; + Op; class PD_PaddleAttr : - Attr()">, + Attr()">, "PaddlePaddle " # description # " attribute">; @@ -33,12 +33,12 @@ class PD_PaddleAttr : // PaddlePaddle type definitions //===----------------------------------------------------------------------===// -def PD_PDDialectType : Type()">, "PaddlePaddle type">; +def PD_PDDialectType : Type()">, "PaddlePaddle type">; class PD_PaddleType : - Type()">, + Type()">, "Paddle " # description # " type">, - BuildableType<"getType()">; + BuildableType<"getType()">; //===----------------------------------------------------------------------===// // Integer types diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5ba48581ee62f4e77328ed9f91ad956632dbbb7 --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + +namespace infrt { +namespace pd { +void PaddleDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT + , +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + >(); +} + +mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + return builder.create(loc, value); +} + +void ConstantOp::build(mlir::OpBuilder &builder, + mlir::OperationState &state, + mlir::Attribute value) { + if (auto elem_attr = value.dyn_cast()) { + return ConstantOp::build(builder, state, elem_attr); + } else if (value.isa()) { + mlir::ShapedType type = + mlir::RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", mlir::DenseElementsAttr::get(type, value)); + state.addTypes(type); + return; + } + llvm_unreachable("unsupported attribute type for building pd.constant"); +} + +mlir::LogicalResult ConstantOp::inferReturnTypes( + mlir::MLIRContext *context, + mlir::Optional location, + mlir::ValueRange operands, + mlir::DictionaryAttr attributes, + mlir::RegionRange regions, + llvm::SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(attributes.get("value").getType()); + return mlir::success(); +} +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { + return value(); +} +} // namespace pd +} // namespace infrt diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.h b/paddle/infrt/dialect/pd/ir/pd_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8383ff6ed8201c4f8948ebaa4effaac3d783cc52 --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..827df597b76e2ec5b4cf639c984a425f9be8b6c9 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt @@ -0,0 +1,8 @@ + +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_op_fuse_pass.cc + ) + +mlir_add_rewriter(pd_op_fuse) diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td similarity index 97% rename from paddle/infrt/dialect/rewrite.td rename to paddle/infrt/dialect/pd/pass/pd_op_fuse.td index 62e7471a390dfeee1a9ddfc15033e85db0adca2e..f5a8ea78d7d9da5cc70b50d31836b4f4933d5853 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td @@ -3,8 +3,8 @@ include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/pd_ops.td" -include "paddle/infrt/dialect/pd_extra_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td" //===----------------------------------------------------------------------===// // This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'. diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8bdf957db27d8c2b20025931a76826628feddbdd --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT + +#include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +namespace { +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc" // NOLINT + +/* + * PdOpFusePass. + */ +struct PdOpFusePass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "PdOpFusePass"; } + + llvm::StringRef getArgument() const override { return "pd-op-fuse"; } + + void runOnFunction() override; +}; + +// Implementation of the PdOpFusePass. +void PdOpFusePass::runOnFunction() { + ::mlir::RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); +} + +} // namespace + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h similarity index 69% rename from paddle/fluid/operators/reduce_ops/reduce_all_op.cu rename to paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h index a1f1a228aeb3a20807059a306a2fbff22d4a0bb8..854545ab1a2638224e16a300bfccb1f953f81c77 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#pragma once +#include -REGISTER_OP_CUDA_KERNEL( - reduce_all, - ops::ReduceCudaKernel); +namespace infrt { +/* + * PdOpFusePass. + */ +std::unique_ptr CreatePdOpFusePass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc deleted file mode 100644 index 96e9e307f2fd3f33be3d2273a7aa66c363e4beb1..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.cc +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/pd_ops.h" - -#include -#include - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - -namespace mlir { -namespace pd { - -#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT - -PaddleDialect::PaddleDialect(MLIRContext *context) - : Dialect("pd", context, TypeID::get()) { - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT - , -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - >(); -} - -mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, - mlir::Attribute value, - mlir::Type type, - mlir::Location loc) { - return builder.create(loc, value); -} - -void ConstantOp::build(OpBuilder &builder, - OperationState &state, - Attribute value) { - if (auto elem_attr = value.dyn_cast()) { - return ConstantOp::build(builder, state, elem_attr); - } else if (value.isa()) { - ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); - state.addAttribute("value", DenseElementsAttr::get(type, value)); - state.addTypes(type); - return; - } - llvm_unreachable("unsupported attribute type for building pd.constant"); -} - -LogicalResult ConstantOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(attributes.get("value").getType()); - return success(); -} -mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef operands) { - return value(); -} -/* -LogicalResult ElementwiseAdd::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} -*/ - -void Elementwise_addOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -/* -mlir::OpFoldResult ElementwiseAdd::fold( - llvm::ArrayRef operands) { - if (getElementTypeOrSelf(getType()).isa()) { - if (!operands[0] || !operands[1]) return {}; - DenseElementsAttr lhs = operands[0].dyn_cast(); - DenseElementsAttr rhs = operands[1].dyn_cast(); - if (!lhs || !rhs) return {}; - ShapedType type = getType().template cast(); - if (!type.hasStaticShape()) return {}; - Type etype = type.getElementType(); - if (!etype.isa()) return {}; - SmallVector values; - values.reserve(lhs.getNumElements()); - for (const auto zip : - llvm::zip(lhs.getValues(), rhs.getValues())) { - values.push_back( - std::plus()(std::get<0>(zip), std::get<1>(zip))); - } - return DenseElementsAttr::get(type, values); - } - return {}; -} - -LogicalResult ElementwiseDiv::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseMul::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseSub::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult MulOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -void ReluOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void FusedRepeatedFCRelu::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void BatchNormOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -}*/ - -} // namespace pd -} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h deleted file mode 100644 index e6b0f30c059054189fe3a86bb112da923ad76423..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - -namespace mlir { -namespace pd { - -class PaddleDialect : public Dialect { - public: - explicit PaddleDialect(MLIRContext* context); - - static StringRef getDialectNamespace() { return "pd"; } - - /// A hook used to materialize constant values with the given type. - Operation* materializeConstant(OpBuilder& builder, - Attribute value, - Type type, - Location loc) override; - - Type parseType(DialectAsmParser& parser) const override { - return Dialect::parseType(parser); - } - void printType(Type type, DialectAsmPrinter& printer) const override { - Dialect::printType(type, printer); - } -}; - -} // namespace pd -} // namespace mlir - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 4e73a533d99a79168b3e68b88d917f48ec811444..67f6bb8a2d7bbfa604614e4909169c08ea18e1b3 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -5,9 +5,6 @@ endif() add_subdirectory(ir) add_subdirectory(pass) -add_executable(phi-ir-exec phi_ir_exec.cc) -target_link_libraries(phi-ir-exec infrt) - add_executable(phi-exec phi_exec.cc) target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 8c3a79498d74d3b80e1590bbc2c0530c7af6411e..1fda2d9d8886008c6415b5a1cf36d53c1500707a 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect { class PDT_Op traits = []> : Op {} -class CreateDenseTensorOp - : PDT_Op<"create_dense_tensor", [NoSideEffect]> { +class CreateDenseTensorOp + : PDT_Op<"create_dense_tensor." # target, [NoSideEffect]> { let arguments = (ins Context:$context, I64ArrayAttr:$dims, LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision); let results = (outs DenseTensor:$output); @@ -51,9 +51,11 @@ class CreateContextOp let results = (outs Context:$output); } -def PDT_CreateDenseTensorOp : CreateDenseTensorOp; +def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">; +def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">; def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateCPUContextOp : CreateContextOp<"cpu">; +def PDT_CreateGPUContextOp : CreateContextOp<"gpu">; def PDT_PrintDenseTensor : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index d8095d7f3f13fcfbf9b2ccab6db182850633d632..f91381fe729034b3e2d36068dce43d531bfedc1c 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -29,6 +29,7 @@ namespace infrt { namespace phi { void PHIDialect::initialize() { + LOG(INFO) << "PHI Dialect initalized"; addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt index 5c55a6b0acaed7be9ee86b4662d895d08ca05bdc..dc60ecf63fe2eaeffc11ac932e594274c01f8580 100644 --- a/paddle/infrt/dialect/phi/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ -2,6 +2,8 @@ core_gather_headers() gather_srcs(infrt_src SRCS proto_arg_map_context.cc - phi_op_cvt_pass.cc + phi_op_convert_pass.cc kernel_op_desc.cc - ) + ) + +cc_test(test_kernel_op_desc SRCS kernel_op_desc_test.cc DEPS infrt) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 353b1054e71374987207e1055289258915e0774d..a26e8e2dca57081d9935883bbe0f01188abf1f1b 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -73,7 +73,7 @@ std::string getPhiLayoutSuffix(LayoutType layout) { } } -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces) { std::vector candidate_kernels; PhiKernelDesc phi_kernel_desc; @@ -88,19 +88,20 @@ std::vector getCandidateKernels( if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; place.layout = LayoutType::ANY; } - phi_kernel_desc.kernelType = place; - phi_kernel_desc.inputsType.clear(); - phi_kernel_desc.outputsType.clear(); + phi_kernel_desc.kernel_type = place; + phi_kernel_desc.input_types.clear(); + phi_kernel_desc.output_types.clear(); phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); const paddle::SmallVector& input_arg = args_def.input_defs(); const paddle::SmallVector& output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { - phi_kernel_desc.inputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); + phi_kernel_desc.input_types.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } for (auto tensor_arg : output_arg) { - phi_kernel_desc.outputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); + phi_kernel_desc.output_types.emplace_back( + ConvertPlaceFromPhi(tensor_arg)); } candidate_kernels.emplace_back(phi_kernel_desc); } diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index b1f7c6c0811def9141e8012518fff5f504934149..cdc8f7cbff553687bed63d165b18c4bc8efdb807 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -21,16 +21,16 @@ namespace infrt { struct PhiKernelDesc { - std::vector inputsType; // kernel input place - std::vector outputsType; // kernel output place - Place kernelType; // kernel place + std::vector input_types; // kernel input place + std::vector output_types; // kernel output place + Place kernel_type; // kernel place }; std::string getPhiTargetPrefix(TargetType target); std::string getPhiPrecisionSuffix(PrecisionType precision); std::string getPhiLayoutSuffix(LayoutType layout); -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces); } // namespace infrt diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc similarity index 51% rename from paddle/fluid/operators/reduce_ops/reduce_min_op.cu rename to paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc index 44548b8d2e778e4a570d085be6f2538b64ab7824..bd5f0799a60d5d3925e1e1265997820c37b438e6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,13 +11,22 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -// reduce_min -REGISTER_OP_CUDA_KERNEL( - reduce_min, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); +#include +#include +#include + +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/phi/kernels/declarations.h" + +namespace infrt { + +TEST(phi, get_op_desc) { + std::vector places; + places.emplace_back( + TargetType::CPU, PrecisionType::FLOAT32, LayoutType::NCHW); + auto kernels = GetCandidateKernels("addmm", places); + ASSERT_GE(kernels.size(), 1UL); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..13cba6eeabb669cf93deb9a37d87d2ddff66e5c0 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/ops/compat/signatures.h" + +namespace { +class PhiOpConvertPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "PhiOpConvertPass"; } + void runOnFunction() override; + PhiOpConvertPass(); + explicit PhiOpConvertPass(const std::vector &valid_places) + : valid_places_(valid_places) {} + + PhiOpConvertPass(const PhiOpConvertPass &other) + : mlir::PassWrapper(*this), + valid_places_(other.valid_places_) {} + + ::llvm::StringRef getArgument() const override { return "phi-op-convert"; } + void getDependentDialects(mlir::DialectRegistry ®istry) const override; + + private: + void convertStage(); + void dispatchStage(); + + // Force a specified data format for all layout sensitive operations. + Option valid_places_options_{ + *this, + "valid-targets", + llvm::cl::desc("Set the valid target, [CPU-FP32-NCHW]")}; + + std::vector valid_places_; +}; +// Implementation of the PhiOpConvertPass. +void PhiOpConvertPass::runOnFunction() { + convertStage(); + dispatchStage(); +} + +void PhiOpConvertPass::convertStage() { + mlir::Block &body = getFunction().front(); + std::vector worklist; + for (auto &op : body.without_terminator()) { + worklist.push_back(&op); + } + mlir::OpBuilder builder(&body, body.begin()); + while (!worklist.empty()) { + auto *op = worklist.back(); + worklist.pop_back(); + if (!op) continue; + + auto op_name = op->getName().getIdentifier().str(); + + // only convert op in pd dialect. + if (op_name.substr(0, 3) != "pd.") continue; + op_name = op_name.substr(3); + if (pd_dialect_inputs_info_map_.find(op_name) == + pd_dialect_inputs_info_map_.end() || + pd_dialect_outputs_info_map_.find(op_name) == + pd_dialect_outputs_info_map_.end()) { + LOG(WARNING) << "No op info found for " << op_name; + // Todo: print log + continue; + } + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) { + std::string kernel_name = phi::TransToPhiKernelName(op_name); + auto kernel_op = builder.create(loc, + op->getResultTypes(), + op->getOperands(), + kernel_name, + op->getAttrDictionary()); + op->replaceAllUsesWith(kernel_op.getResults()); + } else { + ::phi::KernelSignature kernel_sign = + ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + infrt::ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No input info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); + } + + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No output info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); + } + } + CHECK(op->use_empty()); + op->erase(); + } +} + +void PhiOpConvertPass::dispatchStage() { + std::vector worklist; + mlir::Block &block = getFunction().front(); + for (auto &op : block) { + infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); + if (nullptr != kernel_op) worklist.push_back(kernel_op); + } + + mlir::OpBuilder builder(&block, block.begin()); + std::map phi_context; + for (infrt::KernelOp kernel_op : worklist) { + std::string kernel_name = kernel_op.name().str(); + std::vector candidates = + GetCandidateKernels(kernel_name, valid_places_); + if (candidates.empty()) { + LOG(FATAL) << "No candidate kernels for op:" << kernel_name; + continue; + } + builder.setInsertionPoint(kernel_op); + + // Todo: Implimentation the concrete pass pick strategy + const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front(); + + kernel_name = + infrt::getPhiTargetPrefix(phi_kernel_desc.kernel_type.target) + + kernel_name + + infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernel_type.precision) + + infrt::getPhiLayoutSuffix(phi_kernel_desc.kernel_type.layout); + + mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); + mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); + + if (phi_context.find(phi_kernel_desc.kernel_type.target) == + phi_context.end()) { + switch (phi_kernel_desc.kernel_type.target) { + case infrt::TargetType::CPU: { + auto context_value = + builder + .create( + kernel_op.getLoc(), + infrt::phi::ContextType::get(kernel_op.getContext(), + infrt::TargetType::CPU)) + .output(); + phi_context[infrt::TargetType::CPU] = context_value; + } break; + case infrt::TargetType::GPU: + case infrt::TargetType::UNK: + default: + LOG(FATAL) << "Unsupported TargetType"; + break; + } + } + operation_state.addOperands( + phi_context.at(phi_kernel_desc.kernel_type.target)); + + for (size_t index = 0; index < phi_kernel_desc.input_types.size(); + ++index) { + mlir::Value input = kernel_op.getOperand(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), + infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.input_types[index].target, + phi_kernel_desc.input_types[index].precision, + phi_kernel_desc.input_types[index].layout), + input); + operation_state.addOperands(cvt_tensor_type_op.output()); + } + + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); + ++index) { + operation_state.addTypes(infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.output_types[index].target, + phi_kernel_desc.output_types[index].precision, + phi_kernel_desc.output_types[index].layout)); + } + operation_state.addAttributes(kernel_op.attrsAttr().getValue()); + mlir::Operation *phi_operation = builder.createOperation(operation_state); + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); + ++index) { + mlir::Value input = phi_operation->getResult(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); + kernel_op.getResult(index).replaceAllUsesWith( + cvt_tensor_type_op.output()); + } + kernel_op.erase(); + } +} + +PhiOpConvertPass::PhiOpConvertPass() { + if (!valid_places_options_.hasValue()) { + valid_places_.emplace_back(infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); + return; + } + + LOG(FATAL) << "To be done for specifying places in command line"; +} + +void PhiOpConvertPass::getDependentDialects( + mlir::DialectRegistry ®istry) const { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); +} + +} // namespace + +mlir::PassRegistration phi_op_convert; + +std::unique_ptr infrt::createPhiOpCvtPass( + std::vector valid_places) { + return std::make_unique(valid_places); +} + +std::unique_ptr infrt::createPhiOpCvtPass() { + return std::make_unique(); +} diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h similarity index 86% rename from paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h rename to paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h index 8b1944042aa7c42fef87786af0d0fa131c6f0535..5a2c0ee96ed0de120f8667d8f2fb91314c02e9ac 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -21,7 +21,8 @@ namespace infrt { * phiOpCvtPass. * Convert the general operators from pd Dialect to phi dialect. */ -std::unique_ptr createPhiOpCvtPass( - std::vector valid_places = std::vector()); +std::unique_ptr createPhiOpCvtPass(std::vector valid_places); + +std::unique_ptr createPhiOpCvtPass(); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc deleted file mode 100644 index 485bf2a75d890aa0df5e888e7284ae3451aa514c..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" -#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" -#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" -#include "paddle/phi/core/compat/op_utils.h" -#include "paddle/phi/ops/compat/signatures.h" - -namespace { -class phiOpCvtPass - : public mlir::PassWrapper { - public: - ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } - void runOnFunction() override; - explicit phiOpCvtPass( - std::vector valid_places = std::vector()) - : valid_places_(valid_places) {} - - private: - void convertStage(); - void diapatchStage(); - std::vector valid_places_; -}; - -// Implementation of the phiOpCvtPass. -void phiOpCvtPass::runOnFunction() { - convertStage(); - diapatchStage(); -} -void phiOpCvtPass::convertStage() { - mlir::Block &body = getFunction().front(); - std::vector worklist; - for (auto &op : body.without_terminator()) { - worklist.push_back(&op); - } - mlir::OpBuilder builder(&body, body.begin()); - while (!worklist.empty()) { - auto *op = worklist.back(); - worklist.pop_back(); - if (op == nullptr) continue; - - std::string op_name = op->getName().getIdentifier().str(); - - // only convert op in pd dialect. - if (op_name.substr(0, 3) != "pd.") continue; - op_name = op_name.substr(3); - if (pd_dialect_inputs_info_map_.find(op_name) == - pd_dialect_inputs_info_map_.end() || - pd_dialect_outputs_info_map_.find(op_name) == - pd_dialect_outputs_info_map_.end()) { - // Todo: print log - continue; - } - - ::phi::KernelSignature kernel_sign = - ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( - infrt::ProtoArgumentMappingContext(op)); - // resort input&output according to kernel_sign - ::llvm::SmallVector inputs, ori_output; - ::llvm::SmallVector output_types; - for (const std::string &str : std::get<0>(kernel_sign.args)) { - if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log - return; - } - uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); - inputs.push_back(op->getOperands()[index]); - } - - for (const std::string &str : std::get<2>(kernel_sign.args)) { - if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log - return; - } - uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); - output_types.push_back(op->getResultTypes()[index]); - ori_output.push_back(op->getResult(index)); - } - - auto loc = getFunction().getLoc(); - builder.setInsertionPoint(op); - auto kernel_op = builder.create( - loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); - for (size_t index = 0; index < ori_output.size(); ++index) { - ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); - } - if (!op->use_empty()) { - // Todo: print error log - return; - } - op->erase(); - } -} -void phiOpCvtPass::diapatchStage() { - std::vector worklist; - mlir::Block &block = getFunction().front(); - for (auto &op : block) { - infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); - if (nullptr != kernel_op) worklist.push_back(kernel_op); - } - - mlir::OpBuilder builder(&block, block.begin()); - std::map phi_context; - for (infrt::KernelOp kernel_op : worklist) { - std::string kernel_name = kernel_op.name().str(); - std::vector candidates = - getCandidateKernels(kernel_name, valid_places_); - if (candidates.empty()) { - LOG(FATAL) << "No candidate kernels for op:" << kernel_name; - continue; - } - builder.setInsertionPoint(kernel_op); - - // Todo: Implimentation the concrete pass pick strategy - const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front(); - - kernel_name = - infrt::getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + - kernel_name + - infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + - infrt::getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); - - mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); - mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); - - if (phi_context.find(phi_kernel_desc.kernelType.target) == - phi_context.end()) { - switch (phi_kernel_desc.kernelType.target) { - case infrt::TargetType::CPU: { - auto context_value = - builder - .create( - kernel_op.getLoc(), - infrt::phi::ContextType::get(kernel_op.getContext(), - infrt::TargetType::CPU)) - .output(); - phi_context[infrt::TargetType::CPU] = context_value; - } break; - case infrt::TargetType::GPU: - case infrt::TargetType::UNK: - default: - LOG(FATAL) << "Unsupported TargetType"; - break; - } - } - operation_state.addOperands( - phi_context.at(phi_kernel_desc.kernelType.target)); - for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { - mlir::Value input = kernel_op.getOperand(index); - auto cvt_tensor_type_op = builder.create( - kernel_op.getLoc(), - infrt::DenseTensorType::get( - kernel_op.getContext(), - phi_kernel_desc.inputsType[index].target, - phi_kernel_desc.inputsType[index].precision, - phi_kernel_desc.inputsType[index].layout), - input); - operation_state.addOperands(cvt_tensor_type_op.output()); - } - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); - ++index) { - operation_state.addTypes(infrt::DenseTensorType::get( - kernel_op.getContext(), - phi_kernel_desc.outputsType[index].target, - phi_kernel_desc.outputsType[index].precision, - phi_kernel_desc.outputsType[index].layout)); - } - operation_state.addAttributes(kernel_op.attrsAttr().getValue()); - mlir::Operation *phi_operation = builder.createOperation(operation_state); - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); - ++index) { - mlir::Value input = phi_operation->getResult(index); - auto cvt_tensor_type_op = builder.create( - kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); - kernel_op.getResult(index).replaceAllUsesWith( - cvt_tensor_type_op.output()); - } - kernel_op.erase(); - } -} - -} // namespace - -std::unique_ptr infrt::createPhiOpCvtPass( - std::vector valid_places) { - return std::make_unique(valid_places); -} diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc index 64b184359700ee2625e3c61d21617619a50771e3..1cd5b5a85511fe20e8029185caf4c93d95979b72 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -60,6 +60,10 @@ bool ProtoArgumentMappingContext::IsSelectedRowsInput( const std::string& name) const { return false; } +bool ProtoArgumentMappingContext::IsDenseTensorVectorInput( + const std::string& name) const { + return false; +} bool ProtoArgumentMappingContext::IsDenseTensorOutput( const std::string& name) const { diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index e4e9b5c3ff8a15dbe00dc1bd57fdce1a087437d8..5cf2ef979076d697f1991ad33cd38c36dda16cab 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/phi/core/compat/arg_map_context.h" namespace infrt { @@ -42,6 +42,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { bool IsDenseTensorInput(const std::string& name) const override; bool IsSelectedRowsInput(const std::string& name) const override; + bool IsDenseTensorVectorInput(const std::string& name) const override; bool IsDenseTensorOutput(const std::string& name) const override; bool IsSelectedRowsOutput(const std::string& name) const override; diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index de61dba8e744c88f279761520ac1815bb265d875..0beb5bff29f6df73be75a18611a5207bb1e3aad7 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -18,7 +18,7 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" int main(int argc, char** argv) { static llvm::cl::opt input_file( diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td index 46c250b05492cefe61d8e677a352a217718189b8..6467c1285f85e0c8bfca7b873ce64a09a52074ff 100644 --- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" include "paddle/infrt/dialect/tensorrt/trt_ops.td" def PD2TRT_Matmul_Lower : Pat< diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index ad6b136463a71dcc2fcd9ce2b4e2da6f68e88dd2..0878163a955af236c6a40f60850e9e5cad67b2aa 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -17,11 +17,12 @@ #include #include #include -#include #include #include #include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + namespace infrt { namespace trt { namespace { @@ -54,8 +55,8 @@ bool reverseDfs(std::vector source, // merge the first&second graph op to a new graph op. void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT - mlir::pd::GraphOp first, - mlir::pd::GraphOp second) { + infrt::pd::GraphOp first, + infrt::pd::GraphOp second) { // comput inputs and outputs ::llvm::SmallVector inputs(first.getOperands()), outputs; for (mlir::Value input : second.getOperands()) { @@ -84,7 +85,7 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create(loc, return_types, inputs); + auto graph_op = builder.create(loc, return_types, inputs); mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), @@ -149,13 +150,13 @@ void TRTGraphFusePass::runOnFunction() { do { changed = false; for (auto &op : body) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null(user_op); + infrt::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. std::vector source_nodes; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index e3a7b455024c65d40ccbafb28fba9e9b0ead0369..ade61bfc370f550cf85267b3088d697bf1bea997 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -15,24 +15,24 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void TRTGraphSplitPass::runOnFunction() { - std::vector worklist; + std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - mlir::pd::GraphOp graph_op = worklist.back(); + infrt::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); mlir::Block* body = graph_op.getBody(); auto return_op = body->getTerminator(); diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 83bebdb6bf19bdf8f75d11d693813b8169e297a0..19c6b13e971ec779ed178413ca08b42b23dc71d1 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" namespace infrt { @@ -27,7 +27,7 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern { : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {} ::mlir::LogicalResult matchAndRewrite( ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override { - auto casted_op = ::llvm::dyn_cast(op); + auto casted_op = ::llvm::dyn_cast(op); ::mlir::Operation::operand_range inputs = casted_op.inputs(); auto ods_loc = rewriter.getFusedLoc(op->getLoc()); CreateEngineOp create_engine_op; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 9f348b4122fc74033703c92459e6cfa5b3a1f3a2..ef9ccc82678f4bf2e2b518bf346d25393b9e480c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -17,7 +17,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { @@ -35,13 +35,13 @@ void TRTOpTellerPass::runOnFunction() { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); ::llvm::SmallVector tblgen_repl_values; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index d5222976625a2adece9a87c8952dba10137ae9ba..415a78a6967ab6fd4e2a38380d09a5d5c64b1c2f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -21,6 +21,10 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 78d960b5120454bdd01b779abedbe2f7ec0d5853..76768037dbdb3072976d9f6cf0cdfb4f7956bdd4 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -30,7 +30,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 132a1d7805bdb85af8716e384ec29357a6ff68ad..803a11ed5b7e5ce46211a85471536c0300d42630 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/tensorrt/trt_op_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "trt CreateEngine Op"; @@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator< Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); - let results = (outs TRT_EngineType:$output); + let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); + let results = (outs TRT_EngineType:$engine); } def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { @@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { let description = [{ Describe a tensorrt runtime. }]; - let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); - let results = (outs Variadic:$output); + let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); + let results = (outs Variadic:$output); +} + +def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> { + let summary = "trt compute engine"; + let description = [{ + execute engine + }]; + let arguments = (ins TRT_EngineType:$engine, Context:$context); + let results = (outs DenseTensorList:$outputs); +} + +def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> { + let summary = "trt inspect engine"; + let description = [{ + Show engine + }]; + let arguments = (ins TRT_EngineType:$engine); } def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { @@ -34,11 +53,44 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { TensorRT IActivationLayer. }]; - let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type, + let arguments = (ins DenseTensor:$input, SI32Attr:$activation_type, DefaultValuedAttr:$alpha, DefaultValuedAttr:$beta); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); +} + +def TRT_FullyConnectedOp : TRT_Op<"FullyConnected", [NoSideEffect]> { + let summary = "TensorRT IFullyConnectedLayer"; + let description = [{ + TensorRT IFullyConnectedLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num + ); + let results = (outs + DenseTensor:$output_tensor + ); +} + +def TRT_ConvolutionOp : TRT_Op<"Convolution", [NoSideEffect]> { + let summary = "TensorRT IConvolutionLayer"; + let description = [{ + TensorRT IConvolutionLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num, + I32ArrayAttr:$kernel_size + ); + let results = (outs + DenseTensor:$output_tensor + ); } def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { @@ -48,9 +100,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { TensorRT IElementWiseLayer. }]; - let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation); + let arguments = (ins DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { @@ -60,10 +112,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { TensorRT IMatrixMultiplyLayer. }]; - let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1, - TRT_Tensor:$input2, BoolAttr:$transpose2); + let arguments = (ins DenseTensor:$input1, BoolAttr:$transpose1, + DenseTensor:$input2, BoolAttr:$transpose2); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } #endif // TRT_OPS diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 1506282f6268191a2eece5540d30fbe90d8eeb52..81bf873ddf0cf3f1a94489bd3b0b2769274b1b4a 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -30,10 +30,13 @@ #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" -#endif +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) +#include "paddle/infrt/kernel/tensorrt/registry.h" +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT +#endif // INFRT_WITH_PHI static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", @@ -62,6 +65,9 @@ int main(int argc, char** argv) { #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(®istry); kernel::RegisterInferShapeLaunchers(®istry); +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) + kernel::RegisterTrtKernels(®istry); +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT #endif // load extra shared library diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index c613843cd1779599fbac5aea6042b26b151534e8..bcd44540b336eee6d9a76fc14057e8454b9ae329 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,12 +16,14 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -42,6 +44,13 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace host_context { @@ -277,33 +286,65 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + // TODO(wilber): Find a more appropriate way to handle special cases. + if (op->getName().getStringRef() == "trt.create_engine") { +#ifdef INFRT_WITH_TRT + auto* symbols = impl_->runtime->symbol_table(); + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation; + mlir_operation.operation = op; + mlir_operation.symbol_table = symbols; + impl_->cur_op->AppendArgument(new Value(mlir_operation)); + // TODO(wilber): how to pass DenseTensor to create_engine op? temporialiy + // add a naive implement. + for (int i = 0, e = op->getNumOperands(); i < e; ++i) { + auto operand = op->getOperand(i); + Value* arg_value{nullptr}; + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + arg_value = GetValue(arg); + } else { + arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + } + if (arg_value->is_type()) { + impl_->runtime->FeedInArgs( + std::make_pair(std::to_string(i), ValueRef(arg_value))); + } + } +#else + CHECK(false) << "should not reach here"; +#endif + } else { + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } - // process operands - for (int i = 0, e = op->getNumOperands(); i < e; i++) { - // function argument as value - auto operand = op->getOperand(i); - /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { - if (operand.isa()) { - mlir::BlockArgument arg = operand.dyn_cast(); - Value* arg_value = GetValue(arg); + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); impl_->cur_op->AppendArgument(arg_value); - VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " - << GetValue(arg); - continue; - } - // normal value - Value* arg_value = GetValue(operand); - if (!arg_value) { - auto upstream_op = operand.getDefiningOp(); - arg_value = GetOpResult(upstream_op); + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; } - CHECK(arg_value) << "No-exist argument value found: " - << DumpToString(operand); - impl_->cur_op->AppendArgument(arg_value); - - VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " - << GetValue(operand) << " vs " << arg_value; } // process attributes @@ -383,33 +424,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(tmp[i]); } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - if (res.getType().isa<::infrt::DenseTensorType>()) { - auto r = impl_->value_map.try_emplace( - res, ValueRef(new Value{::phi::DenseTensor()})); - CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) - << "]"; - res_values.push_back(r.first->second.get()); - } else { - res_values.push_back(AddValue(res)); - } - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } - } -#endif - // process regions, we treat regions as attribute. auto num_regions = op->getNumRegions(); if (num_regions > 0) { @@ -438,6 +452,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + return true; } diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 18c25827b8ec5a71907e694cea4e7680b598e883..29328520212fd4d020afc28c1e48d2db604414bc 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -13,15 +13,17 @@ // limitations under the License. #include "paddle/infrt/host_context/paddle_mlir.h" -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { - context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); - context_->getOrLoadDialect(); + context_->getOrLoadDialect(); + context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } @@ -55,7 +57,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( UpdateModelParams(program, &mainFunc); UpdateModelOps(program); UpdateModelOutputs(program); - return module_; } @@ -90,11 +91,15 @@ llvm::SmallVector MLIRModelGenImpl::GetModelInputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); + operandTypes.push_back(type_); } } @@ -116,11 +121,14 @@ llvm::SmallVector MLIRModelGenImpl::GetModelOutputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -167,11 +175,11 @@ void MLIRModelGenImpl::UpdateModelParams( auto name = builder_.getStringAttr(var_desc.name()); std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get( + context_, infrt::TargetType::CPU, precision_, infrt::LayoutType::ANY); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -197,8 +205,9 @@ void MLIRModelGenImpl::UpdateModelOutputs( llvm::SmallVector resultTypes; llvm::SmallVector attrs; + mlir::OperationState state(loc, - mlir::ReturnOp::getOperationName(), + ::infrt::ReturnOp::getOperationName(), operands, resultTypes, attrs); @@ -256,11 +265,13 @@ llvm::SmallVector MLIRModelGenImpl::GetOpOutputType( if (var_desc.name() == var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -321,7 +332,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( switch (type) { ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr); ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr); - ATTR_IMPL_CASE(INT, i, getI32IntegerAttr); + ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr); ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr); ATTR_IMPL_CASE(STRING, s, getStringAttr); @@ -397,3 +408,38 @@ bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, return false; } } + +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type) { + switch (dtype) { + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP16: + *type = infrt::PrecisionType::FLOAT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP32: + *type = infrt::PrecisionType::FLOAT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP64: + *type = infrt::PrecisionType::FLOAT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_BOOL: + *type = infrt::PrecisionType::BOOL; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT8: + *type = infrt::PrecisionType::INT8; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT16: + *type = infrt::PrecisionType::INT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT32: + *type = infrt::PrecisionType::INT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT64: + *type = infrt::PrecisionType::INT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_UINT8: + *type = infrt::PrecisionType::UINT8; + return true; + default: + return false; + } +} diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index e825cbb5a11ea0dfcacfc2b1bbb63bf201219c9d..a351b5cf80e2356a6481ccd302a544dcfe595e05 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -14,22 +14,22 @@ #ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ #define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ +#include +#include +#include +#include +#include +#include #include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/MLIRContext.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" - #include "paddle/infrt/dialect/init_dialects.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/paddle/model_parser.h" @@ -102,4 +102,7 @@ inline std::vector RepeatedToVector( bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, mlir::Builder builder, mlir::Type *type); +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type); + #endif // PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 957d852442b10620244e230a2f7704eb7fa0a33e..1f0b1dabd94d8dcf28e8e0543a8e3b12ed250704 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -24,6 +24,7 @@ #include "paddle/infrt/common/shared.h" #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/symbol_table.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/infrt/tensor/dense_tensor_view.h" @@ -41,7 +42,15 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" -#endif + +#ifdef INFRT_WITH_GPU +#include "paddle/phi/backends/gpu/gpu_context.h" +#endif // INFRT_WITH_GPU +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif // INFRT_WITH_TRT +#endif // INFRT_WITH_PHI namespace infrt { namespace host_context { @@ -72,8 +81,13 @@ using ValueVariantType = ::phi::MetaTensor, ::phi::DenseTensor, backends::CpuPhiContext, +#ifdef INFRT_WITH_GPU + backends::GpuPhiContext, + ::phi::GPUContext, +#endif ::phi::CPUContext, std::vector, + std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, std::vector, @@ -81,6 +95,10 @@ using ValueVariantType = paddle::experimental::Backend, paddle::experimental::DataLayout, paddle::experimental::DataType, +#ifdef INFRT_WITH_TRT + ::infrt::backends::tensorrt::TrtEngine, + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol, +#endif // INFRT_WITH_TRT #endif std::vector, std::vector, @@ -120,8 +138,18 @@ class Value : public common::Object { #ifdef INFRT_WITH_PHI explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_GPU + explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {} + explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {} +#endif explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_TRT + explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) + : data(std::move(x)) {} + explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x) + : data(x) {} +#endif // INFRT_WITH_TRT #endif template diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index f1cbfba1c46b33e461a7c9f08cf646625fbafb24..f20344f6f6b84ae8e63f44c7b7b83c6ba9d8d6da 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(phi) +add_subdirectory(tensorrt) core_gather_headers() diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 39ef172fadef9e0f6317dec192c251c6a1df6828..b27eacf9e522d2bbb8b7ffd70ad57f54e5775499 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -25,6 +25,16 @@ namespace phi { return ctx; } +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext() { + ::phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); + context.PartialInitWithAllocator(); + return context; +} +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 3e9580b91da5724b42c72224847e45715f47dbb7..ae3f76c8fe536f96689680668cc52e4981894063 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -25,6 +25,10 @@ namespace phi { ::phi::CPUContext CreateCPUContext(); +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext(); +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 777fb29ac60d9c7125898752747bbdf553f370c0..6d16b814c6b02b08e279190d5a685d65c124942d 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -15,6 +15,12 @@ #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/place.h" + +#ifdef INFRT_WITH_GPU +#include +#endif namespace infrt { namespace kernel { @@ -34,26 +40,83 @@ namespace phi { {})); } +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision) { + return ::phi::DenseTensor( + const_cast<::phi::Allocator*>(&context.GetAllocator()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), + ::phi::make_ddim(dims.get()), + ConvertLayoutToPhi(layout.get()), + {})); +} + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> value) { - auto place = ::phi::CPUPlace(); + auto place = dense_tensor->place(); float* a_data = dense_tensor->mutable_data(place); - for (int64_t i = 0; i < dense_tensor->numel(); ++i) { - a_data[i] = (value.get())[i]; + if (place.GetType() == ::phi::AllocationType::CPU) { + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (value.get())[i]; + } + } else if (place.GetType() == ::phi::AllocationType::GPU) { +#ifdef INFRT_WITH_GPU + // TODO(wilber): how to set the stream parameter to copy with stream. + cudaMemcpy(a_data, + value.get().data(), + sizeof(float) * value.get().size(), + cudaMemcpyHostToDevice); +#endif + } else { + llvm_unreachable("temporarily not support other target."); } } void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { -#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ - case ::phi::DataType::PHI_DATATYPE: { \ - DTYPE* data = dense_tensor->data(); \ - if (dense_tensor->numel() == 0) break; \ - std::cout << data[0]; \ - for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ - std::cout << "," << data[i]; \ - } \ - break; \ +#ifndef INFRT_WITH_GPU +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } \ + break; \ + } +#else +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } else if (place.GetType() == ::phi::AllocationType::GPU) { \ + std::vector host_data(dense_tensor->numel(), 0); \ + cudaMemcpy(host_data.data(), \ + data, \ + sizeof(DTYPE) * dense_tensor->numel(), \ + cudaMemcpyDeviceToHost); \ + std::cout << host_data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << host_data[i]; \ + } \ + } else { \ + llvm_unreachable("temporarily not support other target."); \ + } \ + break; \ } +#endif ::phi::DDim dims = dense_tensor->dims(); std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 8cc0e39e0e4431f073ac37a7f0557f2c837dc753..47d89506e2aa615b0bc425a4c373c904d937e03f 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -30,6 +30,13 @@ namespace phi { host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision); + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); void PrintDenseTensor(::phi::DenseTensor* dense_tensor); diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 0e071418603f8390ca3283f617b06cf1fa91b94c..36d40118f16a0bd1779765064caaac6dbe414772 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -35,7 +35,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.create_context.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext)); registry->AddKernelWithAttrs( - "phi_dt.create_dense_tensor", + "phi_dt.create_dense_tensor.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor), {"dims", "lod", "layout", "precision"}); registry->AddKernelWithAttrs( @@ -44,6 +44,15 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { {"value"}); registry->AddKernel("phi_dt.print_tensor", INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); + +#ifdef INFRT_WITH_GPU + registry->AddKernel("phi_dt.create_context.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), + {"dims", "lod", "layout", "precision"}); +#endif } } // namespace kernel diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index b7503aa4ef35894dda514fdb7fa4336485323094..a9077220cfc709116479a5d91b39d56ad4007af8 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,6 +25,10 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace kernel { using namespace host_context; // NOLINT @@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { int32_t TensorMapGetSize(TensorMap map) { return map.size(); } +// TODO(wilber): Maybe we should place TensorList type in dt dialect. +#ifdef INFRT_WITH_PHI +phi::DenseTensor TensorListGetTensor(std::vector list, + Attribute idx) { + CHECK_LT(idx.get(), static_cast(list.size())) + << "idx should less than list size"; + return *list[idx.get()]; +} + +int32_t TensorListGetSize(const std::vector &list) { + return list.size(); +} +#endif + DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } template @@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(TensorMapGetTensor)); registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize)); +// TensorList related methods. +#ifdef INFRT_WITH_PHI + registry->AddKernelWithAttrs( + "dt.tensor_list_get_tensor", INFRT_KERNEL(TensorListGetTensor), {"id"}); + registry->AddKernel("dt.tensor_list_get_size", + INFRT_KERNEL(TensorListGetSize)); +#endif + registry->AddKernel("dt.shallow_copy_tensor", INFRT_KERNEL(ShallowCopyTensor)); diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd35fccbe2aa35453a4d4ac13364ef6bb5a6b6aa --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt @@ -0,0 +1,10 @@ +if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT)) + return() +endif() + +core_gather_headers() + +gather_srcs(infrt_src SRCS + registry.cc + trt_kernels.cc +) diff --git a/paddle/infrt/kernel/tensorrt/registry.cc b/paddle/infrt/kernel/tensorrt/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..a37e3c0f7f2785e23c8a0b9a25d3283396215f70 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensorrt/registry.h" + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" + +namespace infrt { +namespace kernel { + +void RegisterTrtKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("trt.create_engine", + INFRT_KERNEL(tensorrt::CreateTrtEngine)); + registry->AddKernel("trt.inspect_engine", + INFRT_KERNEL(tensorrt::PrintTrtLayer)); + registry->AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/registry.h b/paddle/infrt/kernel/tensorrt/registry.h new file mode 100644 index 0000000000000000000000000000000000000000..762329ca61d02a16edc150854afcc3dd431a941d --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +} // namespace host_context +} // namespace infrt + +namespace infrt { +namespace kernel { + +/** + * Register all the trt kernels to registry. + */ +void RegisterTrtKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_helper.h b/paddle/infrt/kernel/tensorrt/trt_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..96122bffacdb2251c28e311ae02fe6f9c5319615 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_helper.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "glog/logging.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +static nvinfer1::DataType TensorTypeToWeightType(phi::DataType tensor_type) { + switch (tensor_type) { + case phi::DataType::FLOAT32: + return nvinfer1::DataType::kFLOAT; + case phi::DataType::INT32: + return nvinfer1::DataType::kINT32; + case phi::DataType::FLOAT16: + return nvinfer1::DataType::kHALF; + default: + llvm_unreachable("should not reach here"); + } +} + +static nvinfer1::Dims ArrayAttrToNvDims(const mlir::ArrayAttr& int_array_attr) { + nvinfer1::Dims dims; + dims.nbDims = int_array_attr.size(); + CHECK(!int_array_attr.empty()); + CHECK(int_array_attr[0].getType().isIntOrIndex()); + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = int_array_attr[i].cast().getInt(); + } + return dims; +} + +static nvinfer1::Weights TensorToWeights(phi::DenseTensor* tensor) { + CHECK_NOTNULL(tensor); + nvinfer1::Weights ret; + ret.type = TensorTypeToWeightType(tensor->dtype()); + ret.count = tensor->numel(); + ret.values = tensor->data(); + return ret; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa7609092b82c8ab08b75bfbd3e252801cc79c7d --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#include +#include "NvInfer.h" +#include "NvInferRuntime.h" +#include "NvInferRuntimeCommon.h" +#include "glog/logging.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" + +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" +#include "paddle/infrt/kernel/tensorrt/trt_layers.h" + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol create_engine_op) { + // TODO(wilber): The device_id needs to get from mlir. + int device_id = 0; + backends::tensorrt::TrtEngine engine(device_id); + + auto* builder = engine.GetTrtBuilder(); + // TODO(wilber): How to process weights? + backends::tensorrt::TrtUniquePtr network; + // TODO(wilber): static_shape or dynamic_shape network? The code is just + // static_shape test. + network.reset(builder->createNetworkV2(0)); + + // TODO(wilber): The build option shoule be fiiled from mlir info. + backends::tensorrt::BuildOptions options; + options.max_batch = 4; + options.workspace = 1024; + + // Parse mlir Region which only has one block. + mlir::Operation& operation = *create_engine_op.operation; + auto* symbol_table = create_engine_op.symbol_table; + CHECK_NOTNULL(symbol_table); + + unsigned int num_regions = operation.getNumRegions(); + CHECK_EQ(num_regions, 1U) << "only support one region case."; + auto& region = operation.getRegion(0); + auto& block = region.getBlocks().front(); + + std::unordered_map trt_bind_inputs; + ValueToITensorMap value_to_trt_tensor_map; + ValueToTensorMap value_to_tensor_map; + + for (auto index_operand : llvm::enumerate(operation.getOperands())) { + mlir::Value operand = index_operand.value(); + size_t idx = index_operand.index(); + + const std::string input_name = "input_" + std::to_string(idx); + auto* v = symbol_table->GetValue(std::to_string(idx)); + CHECK_NOTNULL(v); + auto* t = &v->get(); + value_to_tensor_map[operand] = t; + + // TODO(wilber): get input info from mlir. + + // TODO(wilber): input dims, now only support static_shape, and just remove + // the first dimension. If the first dim is not -1, maybe we can pass the + // origin dims. + + // TODO(wilber): now only suppot float input. + + if (operand.isa()) { + // TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU + // tensor, so we treat all GPU tensors as inputs to trt. + if (t->place().GetType() == phi::AllocationType::GPU) { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; + } + } else { + // TODO(wilber): Replace with the op name that generates the weights. + if (operand.getDefiningOp()->getName().getStringRef() != + "phi_dt.create_dense_tensor.cpu") { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; + } + } + } + + // TODO(wilber): Find a way to add layer. + for (auto& operation : block.without_terminator()) { + if (trt::ActivationOp op = llvm::dyn_cast(operation)) { + ActivationFunc( + op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::FullyConnectedOp op = + llvm::dyn_cast(operation)) { + FcFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::ConvolutionOp op = + llvm::dyn_cast(operation)) { + ConvFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else { + CHECK(false) << "not supported operation."; + } + } + + for (auto index_operand : + llvm::enumerate(block.getTerminator()->getOperands())) { + mlir::Value arg = index_operand.value(); + CHECK(value_to_trt_tensor_map.count(arg)); + // TODO(wilber): A trick that we name trt output tensor's name as output_0, + // output_1, ... + value_to_trt_tensor_map[arg]->setName( + ("output_" + std::to_string(index_operand.index())).c_str()); + network->markOutput(*value_to_trt_tensor_map[arg]); + } + for (int i = 0; i < network->getNbOutputs(); ++i) { + engine.PrepareOutputHandle(network->getOutput(i)->getName()); + } + + VLOG(3) << "trt engine build start."; + engine.Build(std::move(network), options); + VLOG(3) << "trt engine build done."; + + // TODO(wilber): get inference options from mlir. + backends::tensorrt::InferenceOptions inference_options; + inference_options.batch = 1; + // TODO(wilber): bind trt input/output tensors. + engine.SetUpInference(inference_options, trt_bind_inputs); + return engine; +} + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { + engine->GetEngineInfo(); +} + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) { + engine->Run(context); + std::vector res; + for (size_t i = 0; i < engine->GetOutputNum(); ++i) { + res.push_back(engine->GetOutput("output_" + std::to_string(i))); + } + return res; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..546ee9dc78852e6967bf8b61ae81563d32beae66 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "mlir/IR/Operation.h" + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace infrt { +namespace host_context { +class SymbolTable; +} // namespace host_context + +namespace kernel { +namespace tensorrt { + +struct MlirOperationWithInfrtSymbol { + mlir::Operation* operation; + ::infrt::host_context::SymbolTable* symbol_table; +}; + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol engine_op); + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context); + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h new file mode 100644 index 0000000000000000000000000000000000000000..19e20c170ec835444a5a37818b837dafb096b2b8 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_layers.h @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" + +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +using ValueToTensorMap = llvm::DenseMap; +using ValueToITensorMap = llvm::DenseMap; + +inline void ActivationFunc( + trt::ActivationOp& act_op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + auto in_arg = act_op.getOperand(); + CHECK(value_to_trt_tensor_map.count(in_arg)) + << "value_to_trt_tensor_map not has in_arg."; + + nvinfer1::ActivationType act_type = + static_cast(act_op.activation_type()); + auto* act_layer = + network->addActivation(*value_to_trt_tensor_map[in_arg], act_type); + act_layer->setAlpha(act_op.alpha().convertToFloat()); + act_layer->setBeta(act_op.beta().convertToFloat()); + for (size_t i = 0; i < act_op->getNumResults(); ++i) { + nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); + mlir::Value act_out = act_op->getResult(i); + value_to_trt_tensor_map[act_out] = act_out_tensor; + } +} + +inline void ConvFunc(trt::ConvolutionOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + int out_channel_num = op.out_channel_num(); + auto size_attrs = op.kernel_size(); + nvinfer1::Dims dims = ArrayAttrToNvDims(size_attrs); + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + auto* layer = + network->addConvolutionNd(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + dims, + kernel_weights, + bias_weights); + CHECK_NOTNULL(layer); + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} + +inline void FcFunc(trt::FullyConnectedOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + CHECK(value_to_trt_tensor_map.count(input_tensor_repr)); + + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + int out_channel_num = op.out_channel_num(); + auto* layer = + network->addFullyConnected(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + kernel_weights, + bias_weights); + + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index 5ce6d8673421ba3c53c9dad6d2fd1f20298f837a..58543a6864258bd6c0153150bb535262d9a8f00d 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,6 +1,8 @@ +cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS}) + configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" - DEPENDS infrtopt infrtexec phi-ir-exec) + DEPENDS infrtopt infrtexec) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) diff --git a/paddle/infrt/tests/dialect/rewrite.mlir b/paddle/infrt/tests/dialect/pd/rewrite.mlir similarity index 97% rename from paddle/infrt/tests/dialect/rewrite.mlir rename to paddle/infrt/tests/dialect/pd/rewrite.mlir index 9fbb09e22449ff98a28b9e22732351ddbbc49dd0..ea0248b9d95d28e0160192a44f4c542d50a4892d 100644 --- a/paddle/infrt/tests/dialect/rewrite.mlir +++ b/paddle/infrt/tests/dialect/pd/rewrite.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt --canonicalize %s | FileCheck %s +// RUN: infrtopt --pd-op-fuse %s | FileCheck %s // CHECK-LABEL: @main func @main() -> tensor { %a = "pd.feed"() {name="input0"} : () -> tensor diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index 3657777a5b0bce1c5a5e4df8d59695f8b122da56..b8cb1a5cec2a17d3f6d15036249fcf9f7f711948 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: @sign_any_float32_execute func @sign_any_float32_execute() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) { + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) { precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 61a66cb3d71a372bcd67cb96362abcb033768e4d..47badd97d37db578ec36f496b21212d73fd9920e 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,4 +1,5 @@ -// RUN: phi-ir-exec %s +// RUN: infrtopt -phi-op-convert -infrt-op-fuse %s + // CHECK-LABEL: @ops func @ops() { %a = pd.feed() {name="input0"} : !infrt.lod_tensor @@ -8,3 +9,10 @@ func @ops() { %h = "pd.abs"(%g):(tensor) -> tensor "pd.fetch"(%h) {name="output"} :(tensor)->() } + +// CHECK-LABEL: @op_execute +func @op_execute(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor, %c:!infrt.lod_tensor) -> !infrt.lod_tensor { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor + %h = "pd.abs"(%g):(tensor) -> tensor + "pd.fetch"(%h) {name="output"} :(tensor)->() +} diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 5b0fa735897a31287bb6dea487e2f22eacd7b0aa..21ee8ebf0b705894446192b0d5d0bfeb9f10f326 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -6,7 +6,7 @@ module { } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor phi_dt.print_tensor(%2 : !infrt.dense_tensor) diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir new file mode 100644 index 0000000000000000000000000000000000000000..ef86dcf1e72a04c478a7763000cf366715665d81 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir @@ -0,0 +1,37 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%0 : !infrt.dense_tensor, %ctx : !phi.context) { + %a = "trt.create_engine"(%0) ({ + %1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1) : (!infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %t = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + "phi_dt.print_tensor" (%t) : (!infrt.dense_tensor) -> () + + //%res = + infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor, !phi.context) -> () + //-> (!infrt.dense_tensor) + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir new file mode 100644 index 0000000000000000000000000000000000000000..c67d47415bfb002d2c7a91ee8b222c6227968d52 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir @@ -0,0 +1,54 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%input_tensor : !infrt.dense_tensor, %kernel_weight : !infrt.dense_tensor, %kernel_bias : !infrt.dense_tensor, %gpu_ctx : !phi.context) { + %a = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.Convolution"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 3 : si32, kernel_size = [3:i32, 3:i32]} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %gpu_ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %gpu_ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%gpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 28:i64, 28:i64], lod=[0:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64, 3:i64, 3:i64, 3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + infrt.call @run_trt(%input_tensor, %kernel_weight, %kernel_bias, %gpu_ctx) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !phi.context) -> () + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir new file mode 100644 index 0000000000000000000000000000000000000000..78dc4ac1c1093c1eb9b3fb30d0ea3f0cd5be6104 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir @@ -0,0 +1,46 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64, 3:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32, 2.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + %engine = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.FullyConnected"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 2 : si32} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + + %res = "trt.compute"(%engine, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir index e3cb9670bec015e58e2a538bb55dfbe7c8b7f554..7bdf62a277896afe2f8a5e156fa8183742f1d853 100644 --- a/paddle/infrt/tests/dialect/trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -1,16 +1,16 @@ // RUN: trt-exec %s // CHECK-LABEL: @main -func @main(%bias:tensor, %c:tensor, %b1:tensor, %b2:tensor, %bias1:tensor, %bias2:tensor) -> tensor { - %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor - %e = "pd.relu6"(%d) {} : (tensor) -> tensor +func @main(%bias:!infrt.dense_tensor, %c:!infrt.dense_tensor, %b1:!infrt.dense_tensor, %b2:!infrt.dense_tensor, %bias1:!infrt.dense_tensor, %bias2:!infrt.dense_tensor) -> !infrt.dense_tensor { + %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e = "pd.relu6"(%d) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor - %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor + %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e1 = "pd.relu"(%d1) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor - %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor + %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e2 = "pd.relu"(%d2) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - infrt.return %e2 : tensor + infrt.return %e2 : !infrt.dense_tensor } diff --git a/paddle/infrt/tests/model/abs_model.py b/paddle/infrt/tests/model/abs_model.py new file mode 100644 index 0000000000000000000000000000000000000000..dd1632bc9d4d8e4e6ea0fb918d1179f4e28a441b --- /dev/null +++ b/paddle/infrt/tests/model/abs_model.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import Layer +from paddle.static import InputSpec +from paddle.jit import to_static +import sys + + +class AbsNet(paddle.nn.Layer): + def __init__(self): + super(AbsNet, self).__init__() + + def forward(self, x): + x = paddle.abs(x) + return x + + +if __name__ == '__main__': + # build network + model = AbsNet() + # save inferencing format model + net = to_static( + model, input_spec=[InputSpec( + shape=[None, 1, 28, 28], name='x')]) + paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc new file mode 100644 index 0000000000000000000000000000000000000000..49266910dbd278fb8d429534134097751cf8b6b1 --- /dev/null +++ b/paddle/infrt/tests/model/test_abs.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/registry.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" +#include "paddle/infrt/host_context/paddle_mlir.h" + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +TEST(ABS_MODEL, convert_and_execute) { + std::string model_file_name = "./abs.pdmodel"; + std::string params_file_name = "./abs.pdiparams"; + // convert model + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); + // pick kernel + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->loadAllAvailableDialects(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); + + if (mlir::failed(pm.run(module_))) { + std::cout << "\npass failed!\n" << std::endl; + } + module_.dump(); + + // executate + infrt::host_context::KernelRegistry registry; + infrt::kernel::RegisterBasicKernels(®istry); + infrt::kernel::RegisterTestKernels(®istry); + infrt::kernel::RegisterTensorShapeKernels(®istry); + infrt::kernel::RegisterTensorKernels(®istry); + infrt::kernel::RegisterControlFlowKernels(®istry); + infrt::kernel::RegisterPhiKernels(®istry); + infrt::kernel::RegisterInferShapeLaunchers(®istry); + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + break; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast( + reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + infrt::host_context::TestMlir(module_, ®istry); +} diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 7b074d0ebb76d110dc361140bd42f78ef54f224b..04e1bbcc9df423bc38e78822ec6ef8ee28c5b216 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -25,8 +25,6 @@ add_subdirectory(tests) # make an unity target for compile deps set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) -# keep this message for debug, remove it later if needless -message(STATUS "All standard phi kernels: ${phi_kernels}") set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) cc_library(phi DEPS ${PHI_DEPS}) diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index c268742fa567bffecb2fd17a773ab56aee019853..c58ebe69523eb97a4357bea061de64e5e01ec181 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -225,6 +225,22 @@ class PADDLE_API Tensor final { */ bool is_selected_rows() const; + /** + * @brief Determine whether tensor is SparseCooTensor + * + * @return true + * @return false + */ + bool is_sparse_coo_tensor() const; + + /** + * @brief Determine whether tensor is SparseCsrTensor + * + * @return true + * @return false + */ + bool is_sparse_csr_tensor() const; + /* Part 3: Device and Backend methods */ /** @@ -324,7 +340,7 @@ class PADDLE_API Tensor final { * * @return std::shared_ptr */ - std::shared_ptr impl() const; + const std::shared_ptr& impl() const; /** * @brief Set the implemention of current Tensor. @@ -333,6 +349,13 @@ class PADDLE_API Tensor final { */ void set_impl(const std::shared_ptr& impl); + /** + * @brief Set the implemention of current Tensor. + * + * @param impl + */ + void set_impl(std::shared_ptr&& impl); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Get the stream where the tensor is currently located @@ -397,7 +420,9 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. * @return void */ - void copy_(const Tensor& src, const bool blocking); + void copy_(const Tensor& src, + const phi::Place& target_place, + const bool blocking); /** * @brief Cast datatype from one to another * @@ -472,7 +497,21 @@ class PADDLE_API Tensor final { */ void set_autograd_meta(std::shared_ptr autograd_meta); - /* Part 9: Auto generated Tensor methods */ + /* Part 9: Inplace methods */ + + /** + * @brief Increase inplace version + */ + void bump_inplace_version(); + + /** + * @brief Get current inplace version + * + * @return uint32_t + */ + uint32_t current_inplace_version(); + + /* Part 10: Auto generated Tensor methods */ private: /** diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 42bf7a8103f837195775b33daf301a7d2e0f4c44..4cbca07236208281f38984022d17b6cb88af8ed8 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -148,4 +148,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta) diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index e1ebe8c6465cfdd7f8213c0a31416bc77412221c..0c11e2df65d0db23b4e080bf041c78d976714013 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -95,12 +95,8 @@ paddle::optional MakeMetaTensor( /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (!out->initialized()) { - auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); - out->set_impl(dense_tensor); - return dense_tensor.get(); + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); } return static_cast(out->impl().get()); } @@ -111,9 +107,7 @@ std::vector SetKernelOutput(size_t out_size, out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { - auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); + auto tensor_ptr = std::make_shared(); results[i] = tensor_ptr.get(); out->emplace_back(); out->back().set_impl(tensor_ptr); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 79b8ac6d0b8352b2e817e6bdbefca74c835ad6b2..e280ab626da74a9b0951925f7472fa49996691cb 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -167,10 +167,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { - phi::DenseTensor result( - phi::make_intrusive( - phi::TransToPhiPlace(target_args_def.backend)), - {out.dtype(), out.dims(), out.layout()}); + phi::DenseTensor result; framework::TransDataDevice( out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; @@ -190,14 +187,14 @@ std::shared_ptr PrepareData( tensor_in->dtype(), target_args_def.dtype, transform_flag) && !NeedTransformLayout( tensor_in->layout(), target_args_def.layout, transform_flag))) { - return std::dynamic_pointer_cast(tensor_in); + return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = TransformData(*(static_cast(tensor_in.get())), target_args_def, transform_flag); - return std::make_shared(out); + return std::make_shared(std::move(out)); } std::shared_ptr PrepareData( diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index 832c19361e5eb03419fe988c9a30304b5993afdf..8f8de02e49bdff8d3e026ca6dbed8948637260ae 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -25,25 +25,24 @@ namespace paddle { namespace experimental { namespace sparse { -Tensor to_sparse_coo_impl(const Tensor& x, - Backend backend, - const int64_t sparse_dim) { +Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim) { if (x.layout() == phi::DataLayout::SPARSE_COO) { return x; } + // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_coo"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_coo"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -62,18 +61,18 @@ Tensor to_sparse_coo_impl(const Tensor& x, // 4. InferMeta auto indices_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout()); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); + auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); // 5. Prepare outputs // create empty SparseCooTensor phi::DenseTensor non_zero_indices( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(indices_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(elements_meta)); auto coo = std::make_shared( non_zero_indices, non_zero_elements, x.dims()); @@ -88,23 +87,23 @@ Tensor to_sparse_coo_impl(const Tensor& x, return out; } -Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { +Tensor to_sparse_csr_impl(const Tensor& x) { if (x.layout() == phi::DataLayout::SPARSE_CSR) { return x; } // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_csr"; if (x.layout() == phi::DataLayout::SPARSE_COO) { kernel_name = "sparse_coo_to_csr"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -122,24 +121,24 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { // 4. InferMeta auto crows_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); auto cols_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout()); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); + auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); // 5. Prepare outputs // create empty SparseCooTensor phi::DenseTensor non_zero_crows( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(crows_meta)); phi::DenseTensor non_zero_cols( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(cols_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(elements_meta)); auto csr = std::make_shared( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); @@ -154,24 +153,25 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { return out; } -Tensor to_dense_impl(const Tensor& x, Backend backend) { +Tensor to_dense_impl(const Tensor& x) { if (x.layout() != phi::DataLayout::SPARSE_CSR && x.layout() != phi::DataLayout::SPARSE_COO) { return x; } + // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "sparse_coo_to_dense"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_dense"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -194,7 +194,7 @@ Tensor to_dense_impl(const Tensor& x, Backend backend) { // create empty SparseCooTensor auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(dense_meta)); kernel_context.EmplaceBackOutput(dense_out.get()); diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.h b/paddle/phi/api/lib/sparse_api_custom_impl.h index 293b2cfa3d33480ccccd0f601f8e15c639b93e1e..6053d281f0ff169db2868cb7fd949b75f816616a 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.h +++ b/paddle/phi/api/lib/sparse_api_custom_impl.h @@ -21,13 +21,11 @@ namespace paddle { namespace experimental { namespace sparse { -Tensor to_dense_impl(const Tensor& x, Backend backend); +Tensor to_dense_impl(const Tensor& x); -Tensor to_sparse_coo_impl(const Tensor& x, - Backend backend, - const int64_t sparse_dim); +Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim); -Tensor to_sparse_csr_impl(const Tensor& x, Backend backend); +Tensor to_sparse_csr_impl(const Tensor& x); } // namespace sparse } // namespace experimental diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 40174a505dcc9b3d475254b7cec7691300c7aecf..066287d4244797e316a34d524eca171e94afcfdf 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" @@ -46,6 +48,7 @@ limitations under the License. */ * In the future, the necessary components will be moved to the this library, * or the corresponding components will be re-implemented. */ + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" @@ -131,6 +134,12 @@ bool Tensor::is_dense_tensor() const { bool Tensor::is_selected_rows() const { return phi::SelectedRows::classof(impl_.get()); } +bool Tensor::is_sparse_coo_tensor() const { + return phi::SparseCooTensor::classof(impl_.get()); +} +bool Tensor::is_sparse_csr_tensor() const { + return phi::SparseCsrTensor::classof(impl_.get()); +} /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { @@ -142,7 +151,12 @@ PlaceType Tensor::place() const { } paddle::platform::Place Tensor::inner_place() const { - return ConvertExtPlaceToInnerPlace(place()); + PADDLE_ENFORCE_NOT_NULL( + impl_, + phi::errors::PermissionDenied( + "Null pointer error, the impl_ of Tensor should not be " + "Null when calling Tensor::inner_place().")); + return impl_->place(); } bool Tensor::is_cpu() const { @@ -286,12 +300,16 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { } } -std::shared_ptr Tensor::impl() const { return impl_; } +const std::shared_ptr &Tensor::impl() const { return impl_; } void Tensor::set_impl(const std::shared_ptr &impl) { impl_ = impl; } +void Tensor::set_impl(std::shared_ptr &&impl) { + impl_ = std::move(impl); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { return platform::stream::get_current_stream(-1)->raw_stream(); @@ -337,5 +355,36 @@ void Tensor::set_autograd_meta( autograd_meta_ = std::move(autograd_meta); } +void Tensor::bump_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: before bump inplace version: " + << inplace_version_counter.CurrentVersion(); + inplace_version_counter.Bump(); + VLOG(3) << "yoki: after bump inplace version: " + << inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "bump_inplace_version is only supported on DenseTensor now.")); + } +} + +uint32_t Tensor::current_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: print version: " + << inplace_version_counter.CurrentVersion(); + return inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "current_inplace_version is only supported on DenseTensor now.")); + } + return 0; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 885e29b27fa8e723ad0e89f9a99c2accd3c172f6..cc797507e68ec11005d6ac35d5dca2d19418598d 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -19,9 +19,12 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace experimental { - // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); Tensor copy_to(const Tensor &x, Backend backend, bool blocking); @@ -67,12 +70,18 @@ template PADDLE_API Tensor Tensor::copy_to>( template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -void Tensor::copy_(const Tensor &src, bool blocking) { +void Tensor::copy_(const Tensor &src, + const phi::Place &target_place, + bool blocking) { if (!src.is_initialized()) { + VLOG(8) << "Src is empty, skip copy"; return; } + // Prepare copy kernel key and outputs + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + KernelType kernel_type = ParseKernelTypeByInputArgs(src); VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { + if (is_initialized()) { PADDLE_ENFORCE_EQ(dtype(), src.dtype(), platform::errors::PreconditionNotMet( @@ -87,10 +96,91 @@ void Tensor::copy_(const Tensor &src, bool blocking) { "Copy cannot be performed!", name(), src.name())); + PADDLE_ENFORCE_EQ(target_place, + inner_place(), + platform::errors::PreconditionNotMet( + "Place is different of dst tensor and args %s, which " + "current tensor holds %s " + "Copy cannot be performed!", + target_place.DebugString(), + inner_place().DebugString())); + kernel_key_set.backend_set = + kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(inner_place())); + } else { + // Deep Copy AutoGrad info from src to self. + *autograd_meta_ = *(src.autograd_meta_); + } + + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + if (kernel_type == KernelType::DENSE_TENSOR_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::DenseTensor &, + phi::Place, + bool, + phi::DenseTensor *); + SetKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy_sr", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::SelectedRows &, + phi::Place, + bool, + phi::SelectedRows *); + SetSelectedRowsKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "We currently only support dense tensor copy for now and if u need to " + "copy selected rows please raise a issue.")); } - auto copy_tensor = - src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); } } // namespace experimental diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 6d056b54b70058e33501083d9754aa27466c0f59..271a58222f0c0f6b60642482691bed635e4d5f3c 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar) diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 1ffe38d8e1f4ce59aa819a5eaa46c75d5fded5b0..35339aed0f3e1cd87ac65855e0255fa3277a6bfb 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() { return platform_manager; } +void DeviceManager::Clear() { + Instance().device_map_.clear(); + Instance().device_impl_map_.clear(); +} + std::vector ListAllLibraries(const std::string& library_dir) { std::vector libraries; std::regex express(".*\\.so"); diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index c0911a0f8d50c52697b748f3726faded5a428694..39eef27b4a607bd3af75a6b5dde07f715e5537e5 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -158,6 +158,8 @@ class DeviceManager { static std::vector GetDeviceList(const std::string& device_type); + static void Clear(); + private: DISABLE_COPY_AND_ASSIGN(DeviceManager); DeviceManager() {} diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index a3b252598582bc212ba66f9c18ec52e035a29a68..0394835aa8b700ba4f9ee9b106661e2d70fc50b6 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -741,6 +741,10 @@ struct GPUContext::Impl { GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} +GPUContext::GPUContext(GPUContext&&) = default; + +GPUContext& GPUContext::operator=(GPUContext&&) = default; + GPUContext::GPUContext(const GPUPlace& place) : DeviceContext(), impl_(std::make_unique(place)) {} diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 3eb4360ad35382369681308b46050cc3e6e04ea0..cd08da1c0f2f8031a461a0410a89254823a6a903 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -77,6 +77,8 @@ class DnnWorkspaceHandle { class GPUContext : public DeviceContext { public: GPUContext(); + GPUContext(GPUContext&&); + GPUContext& operator=(GPUContext&&); explicit GPUContext(const GPUPlace& place); diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 85a1424ee34e04b50a077f5d8ac88d0a0d2fbe78..9bf692703860f15601ad601970ea1f5b1316442b 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1 +1,2 @@ cc_library(phi_place SRCS place.cc) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce) diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cd55c1e88bed6f805a72cff92024d9dc219a1a2 --- /dev/null +++ b/paddle/phi/common/scalar.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/common/scalar.h" + +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace experimental { + +// NOTE(xiongkun): why we put definition here? +// test_custom_op can't include enforce.h, because enforce.h includes gflags. +// so we decouple the include dependence of enforce.h by link. +void ThrowTensorConvertError(int num) { + PADDLE_ENFORCE_EQ(num, + 1, + phi::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, but " + "now Tensor has `%d` elements", + num)); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 72cef89d300c8d60811bde7cf667275b37fedc6f..5134f4eb72639650a0bde34f2abbb0e05ced13c7 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -19,9 +19,12 @@ limitations under the License. */ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/include/tensor.h" + namespace paddle { namespace experimental { +void ThrowTensorConvertError(int); + template class ScalarBase { public: @@ -104,11 +107,7 @@ class ScalarBase { // The Tensor must have one dim ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT is_from_tensor_ = true; - PD_CHECK( - tensor.numel() == 1, - "The Scalar only supports Tensor with 1 element, but now Tensor has `", - tensor.numel(), - "` element."); + ThrowTensorConvertError(tensor.numel()); switch (dtype_) { case DataType::FLOAT32: data_.f32 = tensor.template data()[0]; @@ -156,6 +155,8 @@ class ScalarBase { CopyScalar(other, this); } + // NOTE(xiongkun): some op need to judge the dtype of the Scalar, we expose a + // interface. bool FromTensor() const { return is_from_tensor_; } void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; } diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 25b80279ecf10619d97b8800b24ab5353c79745d..71cec011411641ffe34918f03162800b111275a2 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -89,6 +89,8 @@ class ArgumentMappingContext { virtual bool IsDenseTensorInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0; + // For compatibility with LoDTensorArray + virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; virtual bool IsDenseTensorOutput(const std::string& name) const = 0; virtual bool IsSelectedRowsOutput(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 7f4384545f353ecdbd33c73751e186061bf316cc..946230cb169d20db56a46399552b629348c4783f 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,7 +47,15 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "mean_grad", "max", + "max_grad", + "min", + "min_grad", + "prod", + "prod_grad", + "any", + "all", "reshape", "reshape_grad", "expand", diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index a32e0e44f469694c62ff33863971d3b04004ff37..234e3528c363b948c0a3e3b22d5ee676660fce76 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -37,6 +37,13 @@ void KernelContext::EmplaceBackInputs( std::make_move_iterator(inputs.end())); } +void KernelContext::EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs) { + inputs_.insert(inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); +} + void KernelContext::EmplaceBackOutput(TensorBase* output) { int index = outputs_.size(); outputs_.emplace_back(output); @@ -59,6 +66,13 @@ void KernelContext::EmplaceBackOutputs( std::make_move_iterator(outputs.end())); } +void KernelContext::EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs) { + outputs_.insert(outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); +} + void KernelContext::EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(std::move(attr)); } diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 213ac47d30bfdd28541bd1b9cb24bf2053b1c939..d3ca1ffc61c42c06c2b33cccdb6f1037df237a24 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -52,12 +52,18 @@ class KernelContext { void EmplaceBackInputs(paddle::SmallVector inputs); + void EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs); + void EmplaceBackOutput(TensorBase* output); void EmplaceBackOutputWithoutSetRange(TensorBase* output); void EmplaceBackOutputs(paddle::SmallVector outputs); + void EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs); + void EmplaceBackAttr(paddle::any attr); const std::pair& InputRangeAt(size_t idx) const; diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index be91409762635e8aabdd6953aa5527d94959e4b2..e502b9cb3e02536e8d764a4cbc5e1d5509960303 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -197,8 +197,16 @@ class Kernel { const KernelArgsDef& args_def() const { return args_def_; } + const TensorArgDef& InputAt(size_t idx) const { + return args_def_.input_defs().at(idx); + } + TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); } + const TensorArgDef& OutputAt(size_t idx) const { + return args_def_.output_defs().at(idx); + } + TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); } bool IsValid() { return fn_ != nullptr; } diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index d9ed68593cd610790ee4a0015069ac5a8cfea61b..c3356eadcbd2156617a7a69324e7b440cc54b339 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -98,6 +98,28 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCooTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCsrTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -114,6 +136,16 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(SparseCooTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(SparseCsrTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 06ee5a205d7b0f2f842e1b9b4b8fad8948168b64..260fbfe7197912fd3dd5b9103a0a991a45d55816 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -21,6 +21,10 @@ limitations under the License. */ namespace phi { +// Common InferMeta Functions for backward operators. +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. + void BilinearTensorProductGradInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2947661517e78f328100d9b208cfa45c5f98dae1..aabb944db30b9f30394f092c245bc0307d8bbf3f 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -18,8 +18,11 @@ limitations under the License. */ #include #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + namespace phi { namespace detail { @@ -72,6 +75,51 @@ void AllValueCompareInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config) { + auto dim_x = x.dims(); + auto dim_target = label.dims(); + PADDLE_ENFORCE_EQ(dim_x.size(), + dim_target.size(), + phi::errors::InvalidArgument( + "Input(X) rank and Input(Target) rank should be " + "same, but received X rank(%d) != Target rank(%d)", + dim_x.size(), + dim_target.size())); + for (int i = 0; i < dim_x.size(); i++) { + if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) { + PADDLE_ENFORCE_EQ( + dim_x[i], + dim_target[i], + phi::errors::InvalidArgument( + "Input(X) and Input(Target) should in same shape. but received " + "X dimension[%d](%d) != Target dimension[%d](%d)", + i, + dim_x[i], + i, + dim_target[i])); + } + } + + auto reduction_valid = "mean" == reduction || "sum" == reduction || + "batchmean" == reduction || "none" == reduction; + PADDLE_ENFORCE_EQ( + reduction_valid, + true, + phi::errors::InvalidArgument( + "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); + + if ("none" == reduction) { + out->set_dims(dim_x); + } else { + out->set_dims({1}); + } + out->set_dtype(x.dtype()); +} + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->share_meta(x); } @@ -309,6 +357,161 @@ void CrossInferMeta(const MetaTensor& x, out->share_lod(x); } +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], + 0, + phi::errors::InvalidArgument( + "The dilation of Op(Conv) should be larget than 0, but received " + "dilation is %d.", + dilations[i])); + } + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, + true, + phi::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), + in_dims)); + + PADDLE_ENFORCE_EQ( + in_dims.size(), + filter_dims.size(), + phi::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. But received: the input's shape is [%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, + in_dims.size(), + filter_dims, + filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], + 0, + phi::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( + in_dims.size(), + strides.size() + 2U, + phi::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be euqal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimention and Attr(strides)'s length = %u.", + in_dims.size(), + in_dims, + strides.size(), + phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? in_dims[in_dims.size() - 1] : in_dims[1]; + + PADDLE_ENFORCE_EQ( + input_channels, + filter_dims[1] * groups, + phi::errors::InvalidArgument( + "The number of input's channels should be equal to filter's channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. " + "The error may come from wrong data_format setting.", + input_channels, + in_dims, + filter_dims[1], + filter_dims, + groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, + 0, + phi::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], + filter_dims, + groups)); + + if (config.is_runtime) { + PADDLE_ENFORCE_GT( + filter_dims[0], + 0, + phi::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); + } + + DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + + DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + phi::UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!config.is_runtime) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + const int dkernel = dilations[i] * (filter_data_dims[i] - 1) + 1; + int output_size = + (in_data_dims[i] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1; + output_shape.push_back(output_size); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(input.dtype()); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -430,6 +633,82 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void ExpandAsInferMeta(const MetaTensor& x, + paddle::optional y, + const std::vector& target_shape, + MetaTensor* out) { +#define MAX_RANK_SUPPORTED 6 + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE( + target_shape.size(), + static_cast(x_dims.size()), + phi::errors::InvalidArgument( + "The rank of target_shape must be greater than or equal " + "to the rank of Input(X). But received Input(X): input " + "rank %u; received target_shape: rank %u.", + x_dims.size(), + target_shape.size())); + PADDLE_ENFORCE_LE(target_shape.size(), + MAX_RANK_SUPPORTED, + phi::errors::InvalidArgument( + "The rank of target_shape must be less than or equal " + "to %d. But received: rank %u.", + MAX_RANK_SUPPORTED, + target_shape.size())); + out->set_dims(phi::make_ddim(target_shape)); + out->set_dtype(x.dtype()); +#undef MAX_RANK_SUPPORTED +} + +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out) { + auto index_dims = index.dims(); + + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + + auto input_dim = x.dims(); + auto axis_v = axis.to(); + if (axis.FromTensor() || axis_v == 0) { + // if axis.FromTensor(), we can not obtain correct shape of output + int batch_size = index_dims[0]; + phi::DDim output_dims(input_dim); + output_dims[0] = batch_size; + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } else { + int index_size = index_dims[0]; + std::vector out_dim_vec; + for (int i = 0; i < axis_v; i++) { + out_dim_vec.push_back(input_dim[i]); + } + out_dim_vec.push_back(index_size); + for (int i = axis_v + 1; i < input_dim.size(); i++) { + out_dim_vec.push_back(input_dim[i]); + } + auto output_dims = phi::make_ddim(out_dim_vec); + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } +} + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out) { @@ -476,6 +755,48 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto grid_dims = grid.dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(X) of GridSampleOp should be 4-D Tensor, but " + "received X dimension size(%d)", + x_dims.size())); + PADDLE_ENFORCE_EQ(grid_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(Grid) of GridSampleOp should be 4-D Tensor, " + "but received X dimension size(%d)", + grid_dims.size())); + if (config.is_runtime || grid_dims[3] > 0) { + PADDLE_ENFORCE_EQ( + grid_dims[3], + 2, + phi::errors::InvalidArgument( + "Input(Grid) dimension[3] should be 2, but received %d", + grid_dims[3])); + } + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + grid_dims[0], + x_dims[0], + phi::errors::InvalidArgument( + "Input(X) and Input(Grid) dimension[0] should be equal, but " + "received X dimension[0](%d) != Grid dimension[0](%d)", + x_dims[0], + grid_dims[0])); + } + + out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + void HuberLossInferMeta(const MetaTensor& input, const MetaTensor& label, float delta, @@ -548,6 +869,67 @@ void IndexSampleInferMeta(const MetaTensor& x, out->share_lod(y); } +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output) { + auto input_dim = x.dims(); + auto index_dim = index.dims(); + + PADDLE_ENFORCE_EQ( + dim < input_dim.size() && dim >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + input_dim.size(), + input_dim.size() - 1, + dim)); + + PADDLE_ENFORCE_EQ( + index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), + true, + phi::errors::InvalidArgument( + "The 'shape' of Input(Index) must be 1-D tensor. " + "But received: the 'shape' of Input(Index) is [%s], " + "the dimension of Input(Index) is [%d].", + index_dim, + index_dim.size())); + + PADDLE_ENFORCE_EQ( + index_dim[0] != 0, + true, + phi::errors::InvalidArgument("The length of Input(Index) can't be 0.")); + + auto output_dim = phi::vectorize(input_dim); + if (dim < 0) { + dim += input_dim.size(); + } + output_dim[dim] = index_dim[0]; + output->set_dims(phi::make_ddim(output_dim)); + output->set_dtype(x.dtype()); + output->set_layout(x.layout()); + output->share_lod(x); +} + +void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + auto rank_x = dim_x.size(); + auto rank_y = dim_y.size(); + auto rank = (rank_x > rank_y) ? rank_x : rank_y; + + std::vector dim_out; + dim_out.reserve(rank); + for (int i = 0; i < rank; i++) { + int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); + int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); + dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); + } + out->set_dims(phi::make_ddim(dim_out)); + out->set_dtype(x.dtype()); +} + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, @@ -590,6 +972,13 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out) { + out->set_dims({-1}); // can not infer + out->set_dtype(x.dtype()); +} + void MatmulInferMeta(const MetaTensor& x, const MetaTensor& y, bool trans_x, @@ -693,6 +1082,157 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + if (mode == "all") { + PADDLE_ENFORCE_EQ(phi::product(alpha.dims()), + 1, + phi::errors::InvalidArgument( + "For mode 'all', size of weight Alpha must be one. " + "But recevied alpha's size: %d.", + product(alpha.dims()))); + } else if (mode == "channel") { + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 2, + phi::errors::InvalidArgument( + "For mode 'channel', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ(data_format == "NCHW" || data_format == "NHWC", + true, + phi::errors::InvalidArgument( + "For mode 'channel', data_format must be one of " + "NCHW and NHWC. But recevied data_format: %s", + data_format)); + if (data_format == "NCHW" || config.is_run_mkldnn_kernel) { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[1]: %d", + product(alpha.dims()), + x_dim[1])); + } else { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[x_rank - 1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[%d]: %d", + product(alpha.dims()), + x_rank - 1, + x_dim[x_rank - 1])); + } + } else if (mode == "element") { + auto alpha_dim = alpha.dims(); + auto alpha_rank = alpha_dim.size(); + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 1, + phi::errors::InvalidArgument( + "For mode 'element', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ( + alpha_rank, + x_rank, + phi::errors::InvalidArgument( + "For mode 'element', rank of weight Alpha must be ", + "equal to the rank of input(x). But recevied alpha's rank: %d, " + "x's rank: %d.", + alpha_rank, + x_rank)); + size_t x_product = 1; + size_t alpha_product = 1; + for (int64_t i = x_rank - 1; i > 0; i--) { + x_product *= x_dim[i]; + alpha_product *= alpha_dim[i]; + } + PADDLE_ENFORCE_EQ( + alpha_product, + x_product, + phi::errors::InvalidArgument( + "For mode 'element', the size of weight Alpha must be " + "equal to the size of input(x). But recevied alpha's size: %d, " + "x's size: %d.", + alpha_product, + x_product)); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " + "But recevied " + "mode: '%s'.", + mode)); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + +void SearchsortedInferMeta(const MetaTensor& sorted_sequence, + const MetaTensor& value, + bool out_int32, + bool right, + MetaTensor* out) { + auto sequences_dims = sorted_sequence.dims(); + auto values_dims = value.dims(); + + bool flag = true; + if (sequences_dims.size() != values_dims.size()) { + flag = false; + } + const auto& sequences_dims_size = sequences_dims.size(); + for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { + if (sequences_dims[dim] != values_dims[dim]) { + flag = false; + break; + } + } + if (sequences_dims.size() != 1) { + PADDLE_ENFORCE_EQ( + flag, + true, + phi::errors::Unavailable( + "The dimensions of sorted_sequence tensor ( %s ) and values " + "tensor ( %s ) can not match. Because the input sorted_sequence " + "tensor must be 1 dimension or the first N-1 dimensions of " + "sorted_sequence tensor and input values tensor must match. " + "Please input appropriate sorted_sequence and values again! ", + sequences_dims, + values_dims)); + } + + if (out_int32) { + PADDLE_ENFORCE_LT( + sequences_dims[sequences_dims.size() - 1], + std::numeric_limits::max(), + phi::errors::Unavailable( + "The size of sorted_sequence %d exceed the maximum limit d%. " + "Because the size of sorted_sequence should be less than the " + "output maximum value for int32 bit. Please set appropriate " + "sorted_sequence to meet this requirement! ", + sequences_dims[sequences_dims.size() - 1], + std::numeric_limits::max())); + } + + out->set_dims(values_dims); + if (out_int32) { + out->set_dtype(DataType::INT32); + } else { + out->set_dtype(DataType::INT64); + } +} + void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& segment_ids, const std::string& pooltype, @@ -812,4 +1352,129 @@ void TriangularSolveInferMeta(const MetaTensor& x, out->share_lod(y); } +void YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config) { + auto dim_x = x.dims(); + auto dim_imgsize = img_size.dims(); + int anchor_num = anchors.size() / 2; + + PADDLE_ENFORCE_EQ( + dim_x.size(), + 4, + phi::errors::InvalidArgument("Input(X) should be a 4-D tensor." + "But received X dimension(%s)", + dim_x.size())); + if (iou_aware) { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (6 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " + "+ class_num)) while iou_aware is true." + "But received dim[1](%s) != (anchor_mask_number * " + "(6+class_num)(%s).", + dim_x[1], + anchor_num * (6 + class_num))); + PADDLE_ENFORCE_GE( + iou_aware_factor, + 0, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should greater than or equal to 0." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + PADDLE_ENFORCE_LE( + iou_aware_factor, + 1, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should less than or equal to 1." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + } else { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (5 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))." + "But received dim[1](%s) != (anchor_mask_number * " + "(5+class_num)(%s).", + dim_x[1], + anchor_num * (5 + class_num))); + } + PADDLE_ENFORCE_EQ( + dim_imgsize.size(), + 2, + phi::errors::InvalidArgument("Input(ImgSize) should be a 2-D tensor." + "But received Imgsize size(%s)", + dim_imgsize.size())); + if ((dim_imgsize[0] > 0 && dim_x[0] > 0) || config.is_runtime) { + PADDLE_ENFORCE_EQ( + dim_imgsize[0], + dim_x[0], + phi::errors::InvalidArgument( + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.")); + } + PADDLE_ENFORCE_EQ( + dim_imgsize[1], + 2, + phi::errors::InvalidArgument("Input(ImgSize) dim[1] should be 2." + "But received imgsize dim[1](%s).", + dim_imgsize[1])); + PADDLE_ENFORCE_GT(anchors.size(), + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be greater than 0." + "But received anchors length(%s).", + anchors.size())); + PADDLE_ENFORCE_EQ(anchors.size() % 2, + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be even integer." + "But received anchors length (%s)", + anchors.size())); + PADDLE_ENFORCE_GT(class_num, + 0, + phi::errors::InvalidArgument( + "Attr(class_num) should be an integer greater than 0." + "But received class_num (%s)", + class_num)); + + int box_num; + if ((dim_x[2] > 0 && dim_x[3] > 0) || config.is_runtime) { + box_num = dim_x[2] * dim_x[3] * anchor_num; + } else { + box_num = -1; + } + std::vector dim_boxes({dim_x[0], box_num, 4}); + boxes->set_dims(phi::make_ddim(dim_boxes)); + boxes->set_dtype(x.dtype()); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + scores->set_dims(phi::make_ddim(dim_scores)); +} + +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + detail::BinarySameInputDimsCheck(x, y, config); + + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + } // namespace phi + +PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); +PD_REGISTER_INFER_META_FN(conv2d, phi::ConvInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index cfae45cf04b87c287a174d172700a794c8c2a2a3..d770a096de7c922c674b7edda55ae8cb531a6d00 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -28,12 +29,20 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void AllValueCompareInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, MetaConfig config = MetaConfig()); +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void BCELossInferMeta(const MetaTensor& input, @@ -60,6 +69,20 @@ void CompareInferMeta(const MetaTensor& x, int axis, MetaTensor* out); +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, @@ -81,6 +104,16 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void ExpandAsInferMeta(const MetaTensor& x, + paddle::optional y, + const std::vector& target_shape, + MetaTensor* out); + +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out); + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out); @@ -89,6 +122,11 @@ void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out); +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void HuberLossInferMeta(const MetaTensor& input_meta, const MetaTensor& label_meta, float delta, @@ -101,12 +139,23 @@ void IndexSampleInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output); + +void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, MetaTensor* out, MetaConfig config = MetaConfig()); +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out); + void MatmulInferMeta(const MetaTensor& x, const MetaTensor& y, bool trans_x, @@ -115,6 +164,19 @@ void MatmulInferMeta(const MetaTensor& x, void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config); + +void SearchsortedInferMeta(const MetaTensor& sorted_sequence, + const MetaTensor& value, + bool out_int32, + bool right, + MetaTensor* out); + void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& segment_ids, const std::string& pooltype, @@ -136,4 +198,23 @@ void TriangularSolveInferMeta(const MetaTensor& x, bool unitriangular, MetaTensor* out); +void YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config = MetaConfig()); + +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 84441ed8b740be172ddaa7de3fc23ad420ebf077..3e9da9a217a0a8837d7edadc70401fdad04b4869 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" #include +#include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { @@ -200,6 +202,114 @@ void AucInferMeta(const MetaTensor& input, } } +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config) { + const auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size(); i++) { + PADDLE_ENFORCE_EQ( + (x_dims[i] == -1) || (x_dims[i] > 0), + true, + phi::errors::InvalidArgument( + "Each dimension of input tensor is expected to be -1 or a " + "positive number, but recieved %d. Input's shape is [%s].", + x_dims[i], + x_dims)); + } + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input " + "X must greater than or equal to 2. But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X " + "must smaller than or equal to 5. But received: the shape of input X " + "= [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + + const int64_t C = ((config.is_run_mkldnn_kernel == true) || + (data_layout == DataLayout::kNCHW) + ? x_dims[1] + : x_dims[x_dims.size() - 1]); + auto scale_dim = scale.dims(); + auto bias_dim = bias.dims(); + + PADDLE_ENFORCE_EQ( + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." + "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + y->set_dims(x_dims); + mean_out->set_dims({C}); + variance_out->set_dims({C}); + saved_mean->set_dims({C}); + saved_variance->set_dims({C}); + y->share_lod(x); +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -369,6 +479,113 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out) { + const int64_t input_dims = x.dims()[0]; + const int64_t label_dims = label.dims()[0]; + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "The first dimension of " + "input and label is expected to be the same. " + "But received input's first dimension is %d; " + "label's first dimension is %d.", + input_dims, + label_dims)); + + std::vector output_shape({input_dims, 1}); + out->set_dims(phi::make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { + auto inputs_dims = GetMetaTensorsDim(x); + + const size_t inputs_num = inputs_dims.size(); + PADDLE_ENFORCE_GT( + inputs_num, + static_cast(1), + phi::errors::InvalidArgument( + "The number of input tensors in multi_dot op should > 1.")); + + const size_t n = inputs_dims.size(); + auto first_dim = inputs_dims[0]; + + bool is_vector = false; + phi::DDim out_dim; + + PADDLE_ENFORCE_LT( + first_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the first tensor is 1D of size n view it as a row vector (1, n) + if (first_dim.size() == 1) { + first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); + is_vector = true; + } + + auto last_dim = inputs_dims[n - 1]; + PADDLE_ENFORCE_LT( + last_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the last tensor is 1D of size n view it as a column vector (n, 1) + if (last_dim.size() == 1) { + last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); + out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); + } else { + out_dim = is_vector ? phi::make_ddim({last_dim[1]}) + : phi::make_ddim({first_dim[0], last_dim[1]}); + } + + auto width = first_dim[1]; + for (size_t i = 1; i < n - 1; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), + static_cast(2), + phi::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + const auto& tmp_dim = inputs_dims[i]; + PADDLE_ENFORCE_EQ( + tmp_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + width = tmp_dim[1]; + } + + PADDLE_ENFORCE_EQ( + last_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + + out->set_dims(out_dim); + out->set_dtype(x.at(0)->dtype()); + out->share_lod(*x.at(0)); +} + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, @@ -470,3 +687,5 @@ void WhereInferMeta(const MetaTensor& condition, } } // namespace phi + +PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c11843212ed33fd8170e6677a4d6e0ad95b730dc..068766c0e11671c93285c077ab2328ac20134a13 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,23 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +// Common InferMeta Functions for multiary operators, The format like: +// +// 1. The number of input MetaTensor is more than 3: +// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// const MetaTensor& z, +// const MetaTensor& w, +// ..., +// MetaTensor* out) {} +// +// 2. There are `const vector&` in params: +// void [FunctionDesc|OpName]InferMeta(const vector& x, +// ..., +// MetaTensor* out) {} +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. + std::vector GetMetaTensorsDim(const std::vector& tensors); void AdadeltaInferMeta(const MetaTensor& param, @@ -55,6 +72,26 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config = MetaConfig()); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -70,6 +107,25 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out); + +void MultiDotInferMeta(const std::vector& x, MetaTensor* out); + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index 38eaa636f8c8779c5a1f597b8cfb23ce6efc5edc..55e59b27e71cfb1d9b16a659e40d299ed3f2fc54 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -27,6 +27,8 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 235cfe368c1921eac546b670470963fb49100290..556fb874470dd248dcf7c77e8a8ac3510bd6f63e 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -322,6 +322,158 @@ void NllLossRawInferMeta(const MetaTensor& input, total_weight->set_dtype(input.dtype()); } +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The size of boxes_num should be 1" + ", but received size = %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The format of Input(x) in" + "RoiAlignOp is NCHW. And the rank of input must be 4. " + "But received rank = %d", + input_dims.size())); + PADDLE_ENFORCE_EQ(boxes_dims.size(), + 2, + phi::errors::InvalidArgument("The rank of Input(boxes) " + "in RoiAlignOp should be 2. " + "But the rank of boxes is %d", + boxes_dims.size())); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "The second dimension " + "of Input(boxes) should be 4. But received the " + "dimension = %d", + boxes_dims[1])); + } + + PADDLE_ENFORCE_GT(pooled_height, + 0, + phi::errors::InvalidArgument( + "The 'pooled_height' attribute in RoiAlignOp is " + "invalid. The height must be greater than 0. But " + "received 'pooled_height' = %d", + pooled_height)); + PADDLE_ENFORCE_GT(pooled_width, + 0, + phi::errors::InvalidArgument( + "The 'pooled_width' attribute in RoiAlignOp is " + "invalid. The width must be greater than 0. But " + "received 'pooled_width' = %d", + pooled_width)); + PADDLE_ENFORCE_GT(spatial_scale, + 0.0f, + phi::errors::InvalidArgument( + "The 'spatial_scale' attribute in RoiAlignOp is " + "invalid. The scale must be greater than 0. But " + "received 'spatial_scale' = %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The second dimension of boxes_num should " + "be 1, but received dimension is %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The input data should be a four-dimensional " + "tensor with [N,C,H,W], but received input data with " + " %d dimension", + input_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims.size(), + 2, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...], but received boxes is " + "%d-dimensional LoDTensor", + boxes_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...]. But the second dimension of " + "the received data is %d", + boxes_dims[1])); + + PADDLE_ENFORCE_GT( + pooled_height, + 0, + phi::errors::OutOfRange("The pooled output height must be greater than 0" + "but received height is %d", + pooled_height)); + PADDLE_ENFORCE_GT( + pooled_width, + 0, + phi::errors::OutOfRange("The pooled output width must be greater than 0" + "but received width is %d", + pooled_width)); + PADDLE_ENFORCE_GT( + spatial_scale, + 0.0f, + phi::errors::OutOfRange("The spatial scale must be greater than 0, " + "but received spatial scale is %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + arg_max->set_dims(out_dims); + arg_max->set_dtype(DataType::INT64); +} + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 209a07db18b5c7a87ba094c5839149533757220d..42a0f35dc1d8d6aef13b631d355a4cee951a4ed1 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -30,6 +30,8 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. // +// The InferMeta Functions in this file are arranged in alphabetic order. + void AccuracyInferMeta(const MetaTensor& out, const MetaTensor& indice, const MetaTensor& label, @@ -71,6 +73,26 @@ void NllLossRawInferMeta(const MetaTensor& input, MetaTensor* total_weight, MetaConfig config = MetaConfig()); +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max); + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4d1cb42bd59f072e5926b237528a742c231bcdcf..0f51839553158b6dce7ac90006c5c72ee8e3b57b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -304,6 +304,17 @@ void DiagonalInferMeta(const MetaTensor& input, out->set_dims(phi::make_ddim(out_dims)); } +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask) { + auto x_dims = x.dims(); + out->set_dims(x_dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + if (mask != nullptr) { + mask->set_dims(x_dims); + } +} + void EighInferMeta(const MetaTensor& x, const std::string& uplo, MetaTensor* out_w, @@ -392,6 +403,26 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x, UnchangedInferMetaCheckAxis(x, axis, out); } +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out) { + PADDLE_ENFORCE_GE(bins, + 1, + phi::errors::InvalidArgument( + "The bins should be greater than or equal to 1." + "But received nbins is %d", + bins)); + PADDLE_ENFORCE_GE( + max, + min, + phi::errors::InvalidArgument("max must be larger or equal to min." + "But received max is %d, min is %d", + max, + min)); + + out->set_dims({bins}); + out->share_lod(input); +} + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { PADDLE_ENFORCE_EQ( product(x.dims()), @@ -554,6 +585,89 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_LT(axis, + dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE(axis, + -dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + if (axis < 0) axis += dim_size; + PADDLE_ENFORCE_GE( + k, + 1, + phi::errors::InvalidArgument( + "the k in the kthvalue must >= 1, but received %d .", k)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape")); + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + input_dims[axis], + k, + phi::errors::InvalidArgument( + "input of kthvalue must have >= %d columns in axis of %d", + k, + axis)); + } + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { + auto dims = x.dims(); + auto n_dim = dims.size(); + PADDLE_ENFORCE_GE(n_dim, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. But " + "received a %d dimension tensor.", + n_dim)); + PADDLE_ENFORCE_EQ(dims[n_dim - 2], + dims[n_dim - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + dims[n_dim - 2], + dims[n_dim - 1])); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void MaxPoolWithIndexInferMeta(const MetaTensor& x, const std::vector& kernel_size, const std::vector& strides, @@ -626,6 +740,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); } +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + errors::InvalidArgument( + "the axis of ModeOp must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + errors::InvalidArgument("input of ModeOp must have >= 1d shape")); + if (axis < 0) axis += dim_size; + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + PADDLE_ENFORCE_GE(input_dims.size(), + 1, + errors::InvalidArgument("input shape should >= 1d")); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -661,6 +818,24 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, + MetaTensor* out, + MetaTensor* norm) { + auto xdim = x.dims(); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + + if (is_test == false) { + if (axis < 0) axis = xdim.size() + axis; + xdim[axis] = 1; + norm->set_dims(xdim); + norm->set_dtype(x.dtype()); + } +} + void PadInferMeta(const MetaTensor& input, const std::vector& paddings, float pad_value, @@ -994,6 +1169,53 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out) { + auto shifts_data = shifts.GetData(); + + if (axis.size() != 0) { + PADDLE_ENFORCE_EQ( + axis.size(), + shifts_data.size(), + phi::errors::InvalidArgument("When dims.size() != 0, dims.size() " + "should be equal to " + "shifts.size(). But received " + "dims.size() = %d, shifts.size() = %d", + axis.size(), + shifts_data.size())); + } else { + PADDLE_ENFORCE_EQ( + shifts_data.size(), + 1, + phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() " + "should be equal to 1, But received " + "shifts.size() = %d", + shifts_data.size())); + } + + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { + auto in_dims = x.dims(); + PADDLE_ENFORCE_LT( + in_dims.size(), + 7, + phi::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", + in_dims.size())); +} + +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { + auto in_dim = input.dims(); + out->set_dims(phi::make_ddim({in_dim.size()})); + out->set_dtype(DataType::INT32); +} + void ShardIndexInferMeta(const MetaTensor& in, int index_num, int nshards, @@ -1282,6 +1504,55 @@ void TileInferMeta(const MetaTensor& x, } } +void TopKInferMeta(const MetaTensor& x, + const Scalar& k_scalar, + int axis, + bool largest, + bool sorted, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + phi::errors::InvalidArgument( + "the axis of topk must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + + if (axis < 0) axis += dim_size; + + int k = k_scalar.to(); + if (k_scalar.FromTensor()) { + k = -1; + } else { + PADDLE_ENFORCE_EQ(k >= 1, + true, + phi::errors::InvalidArgument( + "the attribute of k in the topk must >= 1 or be a " + "Tensor, but received %d .", + k)); + } + + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of topk must have >= 1d shape")); + + phi::DDim dims = input_dims; + + dims[axis] = k; + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(DataType::INT64); +} + void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { int dim1 = axis1; @@ -1428,7 +1699,7 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, PADDLE_ENFORCE_GE( axis, -rank, - errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(X). But received axis: %d, R: %d.", axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 75fb9fadf82dc87ac18814e0674e5012fea95ec4..2d51bac995d5142871873dd4a12c22b4bf2de55e 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -31,6 +31,8 @@ class MetaConfig; // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void ArgMinMaxInferMeta(const MetaTensor& x, int64_t axis, @@ -72,6 +74,8 @@ void DiagInferMeta(const MetaTensor& x, void DiagonalInferMeta( const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask); + void EighInferMeta(const MetaTensor& x, const std::string& uplo, MetaTensor* out_w, @@ -87,6 +91,8 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x, bool hard, int axis, MetaTensor* out); +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out); void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); @@ -98,6 +104,16 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig = MetaConfig()); + +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); + void MaxPoolWithIndexInferMeta(const MetaTensor& x, const std::vector& kernel_size, const std::vector& strides, @@ -108,10 +124,22 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, MetaTensor* mask, MetaConfig config = MetaConfig()); +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, + MetaTensor* out, + MetaTensor* norm); void PadInferMeta(const MetaTensor& input, const std::vector& paddings, @@ -162,6 +190,15 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out); + +void SetValueInferMeta(const MetaTensor& x, MetaTensor* out); + +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); + void ShardIndexInferMeta(const MetaTensor& in, int index_num, int nshards, @@ -198,6 +235,15 @@ void TileInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void TopKInferMeta(const MetaTensor& x, + const Scalar& k_scalar, + int axis, + bool largest, + bool sorted, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config = MetaConfig()); + void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index d443b7bb2a09225e78a7001374821114c59a1557..d140912aa783047ba021be171805adff071bf22b 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,11 +27,18 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel) +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel + hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel + matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel + put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel + softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel + triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) +kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) +kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) +kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) @@ -46,6 +53,7 @@ kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index a5b737b28c23ba97988915f00cbf447d2e1b1c22..241a80d85ead2d7bb6cd63105feb345c62a29a62 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -19,20 +19,54 @@ limitations under the License. */ namespace phi { -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx); + +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx); +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx); + template void ReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, @@ -59,46 +93,74 @@ void TanhTripleGradKernel(const Context& dev_ctx, DenseTensor* d_ddx); template -void BReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float t_min, - float t_max, - DenseTensor* dx); +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout); template -void LeakyReluGradKernel(const Context& dev_ctx, +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx); + +template +void EluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& dout, + const DenseTensor& ddx, float alpha, - DenseTensor* dx); + DenseTensor* dx, + DenseTensor* ddout); template -void LeakyReluDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& ddx, - float alpha, - DenseTensor* ddout); +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); template -void ThresholdedReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float threshold, - DenseTensor* dx); - -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); + +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 885dccad8e377642b4cb9e36832ac4bd45f7915f..dbc63a636edb188e4640fdd02895868034f1dd80 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -24,6 +24,21 @@ namespace phi { void name##Kernel( \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out); + +#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out); + DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Tan) DECLARE_ACTIVATION_KERNEL(Acos) @@ -37,24 +52,17 @@ DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) +DECLARE_ACTIVATION_KERNEL(TanhShrink) +DECLARE_ACTIVATION_KERNEL(Silu) +DECLARE_ACTIVATION_KERNEL(Sigmoid) +DECLARE_ACTIVATION_KERNEL(LogSigmoid) -template -void BReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float t_min, - float t_max, - DenseTensor* out); - -template -void LeakyReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float alpha, - DenseTensor* out); - -template -void ThresholdedReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float threshold, - DenseTensor* out); +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) } // namespace phi diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9faaace69176690f64ff81138844567deceef689 --- /dev/null +++ b/paddle/phi/kernels/assign_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/assign_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void AssignKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out) { + if (!x.is_initialized()) { + return; + } + auto& x_tensor = *x.get_ptr(); + Copy(dev_ctx, x_tensor, x_tensor.place(), false, out); +} + +// Note: use `const paddle::optional&> x` +// as input if needed +template +void AssignArrayKernel(const Context& dev_ctx, + const std::vector& x, + std::vector out) { + for (size_t i = 0; i < x.size(); ++i) { + AssignKernel(dev_ctx, *x[i], out.at(i)); + } +} + +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL(assign_array, + CPU, + ALL_LAYOUT, + phi::AssignArrayKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL(assign_array, + GPU, + ALL_LAYOUT, + phi::AssignArrayKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7cc06818dc007f859a3a3513d211008cd2a153e6 --- /dev/null +++ b/paddle/phi/kernels/assign_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +// In order to be compatible with the `AsDispensable` input in the original +// assign op maker, the input parameter here needs to be dispensable, but +// this looks weird +template +void AssignKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out); + +template +void AssignArrayKernel(const Context& dev_ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index f9af50f6832a1884f3ef58ccb5708b1f2636ccea..c582261596221f4db8bd03599386082cee909096 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -21,101 +21,164 @@ limitations under the License. */ namespace phi { -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl>( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ - name, functor_class, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& dout, \ - float attr, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationGradImpl>( \ - dev_ctx, &x, nullptr, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ - name, functor_class, attr1, attr2) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& dout, \ - float attr1, \ - float attr2, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr1; \ - *(attrs[1].second) = attr2; \ - ActivationGradImpl>( \ - dev_ctx, &x, nullptr, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl>( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ - name, functor_class, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& out, \ - const DenseTensor& dout, \ - float attr, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationGradImpl>( \ - dev_ctx, nullptr, &out, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); - -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor); - -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, - funcs::LeakyReluGradFunctor, - alpha); -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( - ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, - funcs::BReluGradFunctor, +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, + LeakyReluGradFunctor, + alpha); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, + ThresholdedReluGradFunctor, + threshold); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + SoftShrinkGradFunctor, + lambda); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + HardShrinkGradFunctor, + threshold); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, + BReluGradFunctor, t_min, t_max); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + HardSigmoidGradFunctor, + slope, + offset); + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + + auto x_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad")); + auto out_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad")); + auto dout_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad")); + auto dx_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad")); + auto* place = dev_ctx.eigen_device(); + + if (alpha > 0) { + funcs::ELUGradFunctor functor; + functor.alpha = alpha; + functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten); + } else { + funcs::ELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten); + } +} + } // namespace phi PD_REGISTER_KERNEL( @@ -144,6 +207,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, ReluDoubleGradKernel) @@ -151,6 +219,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) PD_REGISTER_KERNEL(tanh_triple_grad, CPU, @@ -159,3 +228,8 @@ PD_REGISTER_KERNEL(tanh_triple_grad, float, double, phi::dtype::float16) {} +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 0d13429c8f651ccb40646fddd82a3529a95ab45d..1d7b77ea4445f494105d4c23516f31f349847089 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -19,78 +19,102 @@ limitations under the License. */ namespace phi { -#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationImpl(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - float attr, \ - DenseTensor* out) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationImpl>(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ - name, functor_class, attr1, attr2) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - float attr1, \ - float attr2, \ - DenseTensor* out) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr1; \ - *(attrs[1].second) = attr2; \ - ActivationImpl>(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor) -DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) +DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) + +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, - funcs::ThresholdedReluFunctor, + ThresholdedReluFunctor, threshold) -DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) + +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + HardSigmoidFunctor, + slope, + offset) } // namespace phi PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} #define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ - PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} -PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) -PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) -PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) -PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) -PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) -PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) -PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) -PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) -PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) -PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) -PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) -PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) -PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) -PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) -PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) +PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel) +PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel) +PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel) +PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 1af071f23ddc520e6733acdbeec3a0652f4e1d8f..fa11fd05bf1d656a075b996f8688d755b28cc034 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -38,7 +38,7 @@ void Copy(const Context& dev_ctx, << src_place; dst->Resize(src.dims()); - auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a25f9650fc50fefa3899da13b55b985c164a394a --- /dev/null +++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +// NOTE(@xiongkun): use of IsComplex<> +#include "paddle/fluid/framework/data_type.h" + +namespace phi { +template +void CumprodGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& d_out, + int dim, + DenseTensor* d_x) { + DDim shape = x.dims(); + + auto* d_out_data = d_out.data(); + auto* x_data = x.data(); + auto* out_data = out.data(); + auto* d_x_data = dev_ctx.template Alloc(d_x); + + size_t outer_dim = 1; + size_t mid_dim = 1; + size_t inner_dim = 1; + GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); + size_t numel = outer_dim * mid_dim * inner_dim; + + // deal with complex + const T* x_data_deal; + const T* out_data_deal; + Allocator::AllocationPtr x_conj; + Allocator::AllocationPtr out_conj; + if (paddle::framework::IsComplex::value) { + x_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto* x_data_conj = reinterpret_cast(x_conj->ptr()); + out_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto* out_data_conj = reinterpret_cast(out_conj->ptr()); + + phi::funcs::ForRange for_range_x(dev_ctx, numel); + phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); + for_range_x(functor_x); + + phi::funcs::ForRange for_range_out(dev_ctx, numel); + phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); + for_range_out(functor_out); + + x_data_deal = x_data_conj; + out_data_deal = out_data_conj; + } else { + x_data_deal = x_data; + out_data_deal = out_data; + } + + for (size_t i = 0; i < outer_dim; i++) { + for (size_t k = 0; k < inner_dim; k++) { + for (size_t j = 0; j < mid_dim; j++) { + size_t index = i * mid_dim * inner_dim + j * inner_dim + k; + d_x_data[index] = 0; + for (size_t n = 0; n < mid_dim; n++) { + size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; + T elem; + if (j == 0) { + elem = d_out_data[pos]; + } else { + elem = d_out_data[pos] * out_data_deal[index - inner_dim]; + } + if (pos > index) { + for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { + elem *= x_data_deal[m]; + } + } else if (pos < index) { + elem = static_cast(0); + } + d_x_data[index] += elem; + } + } + } + } +} +} // namespace phi +PD_REGISTER_KERNEL(cumprod_grad, + CPU, + ALL_LAYOUT, + phi::CumprodGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..aea338027f5bb983788c382982dd2e1ad8db5e9a --- /dev/null +++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_kernel.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" + +namespace phi { +template +void CumprodKernel(const Context& dev_ctx, + const DenseTensor& input, + int dim, + DenseTensor* out) { + const DenseTensor* x = &input; + auto* x_data = x->data(); + auto* out_data = dev_ctx.template Alloc(out); + DDim shape = x->dims(); + + size_t outer_dim = 1; + size_t mid_dim = 1; + size_t inner_dim = 1; + GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); + + for (size_t i = 0; i < outer_dim; i++) { + for (size_t j = 0; j < mid_dim; j++) { + for (size_t k = 0; k < inner_dim; k++) { + size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; + if (j == 0) { + out_data[pos] = x_data[pos]; + } else { + out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod, + CPU, + ALL_LAYOUT, + phi::CumprodKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/deformable_conv_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d61f7be68af9cb23363a51065fd06d8b6492bfa --- /dev/null +++ b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +template +inline void ModulatedDeformableIm2colCPUKernel( + const int num_kernels, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + for (int i = 0; i < num_kernels; i++) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + // get outputs of im2col with offset by bilinear interpolation + ModulatedDeformableIm2colCPUKernel(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + CPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/determinant_grad_kernel.cc b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e57d7263f88bfc7b15910f05ec9fa6b45c213e7e --- /dev/null +++ b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(determinant_grad, + CPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/determinant_kernel.cc b/paddle/phi/kernels/cpu/determinant_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5810e88e92527fac2c643b239661c50df32cd6f9 --- /dev/null +++ b/paddle/phi/kernels/cpu/determinant_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +PD_REGISTER_KERNEL( + determinant, CPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 37ad18df56ec30c838dd5bd03c484d7889e976c0..095d11720ce26622c31e517286d6f656869e62ff 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -12,10 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" +namespace phi { + +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + dev_ctx.template Alloc(out); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ + } + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + } + } +} + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + PD_REGISTER_KERNEL(elementwise_fmax, CPU, ALL_LAYOUT, @@ -33,3 +104,49 @@ PD_REGISTER_KERNEL(elementwise_fmin, double, int, int64_t) {} + +PD_REGISTER_KERNEL(add_raw, + CPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract_raw, + CPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(divide_raw, + CPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(multiply_raw, + CPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/gather_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0a6948018afce277725c50e3cbb0e17ab495a83 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_grad_kernel.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_grad_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } + return; + } + + dev_ctx.template Alloc(x_grad); + + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (x_grad->numel() == 0) return; + + if (index_type == phi::DataType::INT32) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } else if (index_type == phi::DataType::INT64) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + CPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/gather_kernel.cc b/paddle/phi/kernels/cpu/gather_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9207a05b9dcce1daed95a1dbdb99db3c23c5c90d --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) { + return; + } + + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + CPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..254c4ea5716d19c65da6a46748a43db8dbddd52b --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/gelu_kernel.h" + +namespace phi { + +template +struct GeluGradFunctor { + template + void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { + if (approximate) { + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + + const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const float kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * + ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) + .tanh(); + dx.device(d) = (static_cast(0.5) * casted_dout * + (static_cast(1) + y + + (casted_x - casted_x * y.square()) * + (kAlpha + kBeta * casted_x.square()))) + .template cast(); + } else { + const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const T kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); + dx.device(d) = static_cast(0.5) * dout * + (static_cast(1) + y + + (x - x * y.square()) * (kAlpha + kBeta * x.square())); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto dx_data = dx.data(); + auto dout_data = dout.data(); + int n = std::min(x.size(), dx.size()); + + auto first = static_cast(std::malloc(n * sizeof(T))); + std::memset(first, 0, n * sizeof(T)); + auto second = static_cast(std::malloc(n * sizeof(T))); + std::memset(second, 0, n * sizeof(T)); + + // first = (0.5 * (1 + erf(x / sqrt(2)))) + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, first, 1); + phi::funcs::CBlas::VMERF(n, first, first, VML_LA); + for (int i = 0; i < n; i++) { + first[i] += static_cast(1); + } + phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); + + // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) + phi::funcs::CBlas::VSQUARE(n, x_data, second); + phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); + phi::funcs::CBlas::VEXP(n, second, second); + phi::funcs::CBlas::VMUL(n, x_data, second, second); + phi::funcs::CBlas::SCAL( + n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); + + // dx = dout * (first + second); + phi::funcs::CBlas::VADD(n, first, second, first); + phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); + + std::free(first); + std::free(second); +#else + // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * + // exp(- x^2 / 2) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + auto first = static_cast(0.5) * + (static_cast(1) + + ((casted_x * static_cast(M_SQRT1_2)).erf())); + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * + casted_x * + (-static_cast(0.5) * casted_x.square()).exp(); + dx.device(d) = (casted_dout * (first + second)).template cast(); + } else { + auto first = + static_cast(0.5) * + (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); + } +#endif + } + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + auto eigen_x = EigenVector::Flatten(x); + auto eigen_out_grad = EigenVector::Flatten(out_grad); + auto eigen_x_grad = EigenVector::Flatten(*x_grad); + auto& dev = *dev_ctx.eigen_device(); + + GeluGradFunctor functor; + functor(dev, eigen_x, eigen_out_grad, eigen_x_grad, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gelu_grad, CPU, ALL_LAYOUT, phi::GeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7af220574565ea96706c2a87aec6751c9203af4 --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +struct GeluFunctor { + template + void operator()(Device d, X x, Out out, bool approximate) const { + if (approximate) { + // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = + (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) + .tanh(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (x + static_cast(GELU_CONSTANT) * x.cube())) + .tanh(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } +#endif + } + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_x = EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + + GeluFunctor functor; + functor(dev, eigen_x, eigen_out, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, CPU, ALL_LAYOUT, phi::GeluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..923cb8424115e00f07274f959ffe34adaa9a0327 --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc @@ -0,0 +1,357 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +static inline void ClipWithMask(const CPUContext& ctx, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode, + DenseTensor* grid_slice, + DenseTensor* grid_scale) { + auto& place = *ctx.eigen_device(); + grid_scale->Resize(grid_slice->dims()); + ctx.Alloc(grid_scale); + + auto grid_slice_t = EigenTensor::From(*grid_slice); + auto factor = static_cast(max_val * 0.5); + if (!align_corners) { + factor = static_cast((max_val + 1) * 0.5); + } + auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); + + if (padding_mode == "border") { + // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); + auto res = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + + auto in_bound = (res == grid_slice_t); + grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); + grid_slice_t.device(place) = res; + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto is_neg = (grid_slice_t < static_cast(0)); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()); + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + auto reflected = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + auto clipped = reflected.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + auto in_bound = (clipped == reflected).template cast(); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * + in_bound; + grid_slice_t.device(place) = clipped; + } + } +} + +template +static void CalcGridLocationsWithGrad(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + ClipWithMask( + ctx, in_w - 1, align_corners, padding_mode, grid_x, grid_x_scale); + ClipWithMask( + ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale); +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& d1, + const DenseTensor& d2) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto d1_t = EigenTensor::From(d1); + auto d2_t = EigenTensor::From(d2); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); + } + } + } + } + } +} + +template +static void GatherBilinearGrad(const CPUContext& ctx, + const DenseTensor& input, + const DenseTensor& output_grad, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale, + DenseTensor* input_grad, + DenseTensor* grid_grad) { + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, // grid_x + grid_y, // grid_y + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + // gather output grad value to input grad by corner point coords and weight + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); + + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto output_grad_t = EigenTensor::From(output_grad); + + if (grid_grad != nullptr) { + DenseTensor grid_grad_x, grid_grad_y; + grid_grad_x.Resize({n, out_h, out_w}); + grid_grad_y.Resize({n, out_h, out_w}); + ctx.Alloc(&grid_grad_x); + ctx.Alloc(&grid_grad_y); + auto grid_grad_x_t = + EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); + auto grid_grad_y_t = + EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + grid_grad_x_t(i, k, l) += + ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + + (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * + output_grad_t(i, j, k, l); + grid_grad_y_t(i, k, l) += + ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + + (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * + output_grad_t(i, j, k, l); + } + } + } + } + + // const T x_max = static_cast(in_w - 1); + // const T y_max = static_cast(in_h - 1); + + auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); + auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); + grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; + grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; + + // gather grid_grad [x, y] in 3rd Dim + T* grid_grad_data = grid_grad->data(); + T* grid_grad_x_data = grid_grad_x.data(); + T* grid_grad_y_data = grid_grad_y.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_grad_data[2 * i] = grid_grad_x_data[i]; + grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; + } + } +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l); + } + } + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + x_grad->Resize({n, c, in_h, in_w}); + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_h, out_w, 2}); + dev_ctx.template Alloc(grid_grad); + phi::funcs::SetConstant()( + dev_ctx, grid_grad, static_cast(0)); + } + + DenseTensor grid_x, grid_y; + DenseTensor grid_x_scale, grid_y_scale; + CalcGridLocationsWithGrad(dev_ctx, + grid, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale); + if (mode == "bilinear") { + GatherBilinearGrad(dev_ctx, + x, + out_grid, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale, + x_grad, + grid_grad); + } else { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GatherOutputGradToInputGrad(out_grid, x_grad, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + CPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..92a528cdda96a191cf73115feb3cf3dd3656305d --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_kernel.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Array4 = Eigen::DSizes; + +template +static inline void Clip(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + if (padding_mode == "border") { + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } + } +} + +template +static void CalcGridLocations(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + Clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); + Clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); +} + +template +static void BilinearInter(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* out) { + auto& place = *ctx.eigen_device(); + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, + grid_y, + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto d_w_scaled_t = + d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_e_scaled_t = + d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_n_scaled_t = + d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_s_scaled_t = + d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + auto output_t = EigenTensor::From(*out); + // bilinear interpolaetion by 4 corner points + output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + + v_en_t * d_w_scaled_t * d_s_scaled_t + + v_ws_t * d_e_scaled_t * d_n_scaled_t + + v_es_t * d_w_scaled_t * d_n_scaled_t; +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + out->Resize(phi::make_ddim({n, c, out_h, out_w})); + dev_ctx.template Alloc(out); + phi::funcs::SetConstant()(dev_ctx, out, static_cast(0)); + + DenseTensor grid_x, grid_y; + CalcGridLocations( + dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y); + + if (mode == "bilinear") { + BilinearInter(dev_ctx, x, &grid_x, &grid_y, out); + } else if (mode == "nearest") { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GetGridPointValue(x, out, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, CPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_utils.h b/paddle/phi/kernels/cpu/grid_sample_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..53a16446d7e8c65b3d2d63835e6a2b86c1f96795 --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_utils.h @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void Unnormalize(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + + if (!align_corners) { + auto factor = static_cast((max_val + 1) * 0.5); + grid_slice_t.device(place) = + (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); + } else { + auto factor = static_cast(max_val * 0.5); + grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; + } +} + +template +inline bool IsInBound(T x, T y, T x_max, T y_max) { + if (x < 0 || x > x_max || y < 0 || y > y_max) { + return false; + } + return true; +} + +template +void GetGridPointValue(const DenseTensor& input, + DenseTensor* output, + const DenseTensor& x, + const DenseTensor& y) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int in_h = input.dims()[2]; + const int in_w = input.dims()[3]; + const int out_h = x.dims()[1]; + const int out_w = x.dims()[2]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto output_t = EigenTensor::From(*output).setConstant((T)0); + auto input_t = EigenTensor::From(input); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + output_t(i, j, k, l) = + input_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))); + } + } + } + } + } +} + +template +void AllNeigbors(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* x_w, + DenseTensor* x_e, + DenseTensor* y_n, + DenseTensor* y_s, // positions + DenseTensor* d_w, + DenseTensor* d_e, + DenseTensor* d_n, + DenseTensor* d_s, // distance + DenseTensor* v_wn, + DenseTensor* v_en, + DenseTensor* v_ws, + DenseTensor* v_es) { // values + auto& place = *ctx.eigen_device(); + + const int c = input.dims()[1]; + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + // calculate coords of 4 corner points + x_w->Resize({n, out_h, out_w}); + x_e->Resize({n, out_h, out_w}); + y_n->Resize({n, out_h, out_w}); + y_s->Resize({n, out_h, out_w}); + ctx.Alloc(x_w); + ctx.Alloc(x_e); + ctx.Alloc(y_n); + ctx.Alloc(y_s); + auto x_w_t = EigenTensor::From(*x_w); + auto x_e_t = EigenTensor::From(*x_e); + auto y_n_t = EigenTensor::From(*y_n); + auto y_s_t = EigenTensor::From(*y_s); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + + x_w_t.device(place) = grid_x_t.floor(); + x_e_t.device(place) = x_w_t + static_cast(1); + y_n_t.device(place) = grid_y_t.floor(); + y_s_t.device(place) = y_n_t + static_cast(1); + + // calculate distances to 4 sides + d_w->Resize({n, out_h, out_w}); + d_e->Resize({n, out_h, out_w}); + d_n->Resize({n, out_h, out_w}); + d_s->Resize({n, out_h, out_w}); + ctx.Alloc(d_w); + ctx.Alloc(d_e); + ctx.Alloc(d_n); + ctx.Alloc(d_s); + auto d_w_t = EigenTensor::From(*d_w); + auto d_e_t = EigenTensor::From(*d_e); + auto d_n_t = EigenTensor::From(*d_n); + auto d_s_t = EigenTensor::From(*d_s); + d_w_t.device(place) = grid_x_t - x_w_t; + d_e_t.device(place) = x_e_t - grid_x_t; + d_n_t.device(place) = grid_y_t - y_n_t; + d_s_t.device(place) = y_s_t - grid_y_t; + + // calc 4 corner points value + v_wn->Resize({n, c, out_h, out_w}); + v_en->Resize({n, c, out_h, out_w}); + v_ws->Resize({n, c, out_h, out_w}); + v_es->Resize({n, c, out_h, out_w}); + ctx.Alloc(v_wn); + ctx.Alloc(v_en); + ctx.Alloc(v_ws); + ctx.Alloc(v_es); + GetGridPointValue(input, v_wn, *x_w, *y_n); + GetGridPointValue(input, v_en, *x_e, *y_n); + GetGridPointValue(input, v_ws, *x_w, *y_s); + GetGridPointValue(input, v_es, *x_e, *y_s); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h new file mode 100644 index 0000000000000000000000000000000000000000..b79aab96c0fc2251f35fe93b525a03676e01fdb1 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidGradKernelImpl( + const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad, + SelectedRows* w_grad_sr = nullptr) { + funcs::SetConstant zero; + DenseTensor pre_out_grad; + + pre_out_grad.Resize(pre_out.dims()); + ctx.template Alloc(&pre_out_grad); + ctx.template Alloc(x_grad); + zero(ctx, x_grad, static_cast(0.0)); + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + // softrelu derivative + + auto blas = funcs::GetBlas(ctx); + + auto* pre_out_grad_data = pre_out_grad.data(); + auto* pre_out_data = pre_out.template data(); + auto n = pre_out.numel(); + blas.VEXP(n, pre_out_data, pre_out_grad_data); + blas.VINV(n, pre_out_grad_data, pre_out_grad_data); + for (int64_t i = 0; i < n; ++i) { + pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; + } + bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) + auto* out_grad_data = out_grad.template data(); + + int64_t dim0 = pre_out_grad.dims()[0]; + int64_t dim1 = pre_out_grad.dims()[1]; + for (int64_t i = 0; i < dim0; ++i) { + T tmp = out_grad_data[i]; + blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); + } + // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to + // be consistent with the clipping in forward. + if (bias_grad) { + ctx.template Alloc(bias_grad); + zero(ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } + ctx.template Alloc(w_grad); + zero(ctx, w_grad, static_cast(0.0)); + if (!is_sparse) { + bit_code->MulGradWeight(pre_out_grad, w_grad, x); + } else { + bit_code->MulGradWeight(pre_out_grad, w_grad_sr, x); + } + bit_code->MulGradError(pre_out_grad, w, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f64a1a8162a379bdad99c6519ef996a4203544a7 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad) { + HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad, + bias_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..096a54f9fb263d3c153ab687d83bb61c63b117d7 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/hierarchical_sigmoid_kernel.h" + +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function_impl.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out) { + size_t num_classes_st = static_cast(num_classes); + // for remote prefetch + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + int64_t code_length = path.get_ptr() ? path.get_ptr()->dims()[1] + : math::FindLastSet(num_classes_st - 1); + int64_t batch_size = x.dims()[0]; + DenseTensor sum; + pre_out->Resize(phi::make_ddim({batch_size, code_length})); + ctx.template Alloc(pre_out); + auto* pre_out_data = pre_out->data(); + auto pre_out_mat = EigenMatrix::From(*pre_out); + // Not all class(leaf) nodes' path lengths equal code_length, thus init as + // 0s can avoid out of path's loss. + funcs::SetConstant zero; + zero(ctx, pre_out, static_cast(0.0)); + auto& place = *ctx.eigen_device(); + funcs::RowwiseSum row_sum; + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes_st, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + std::vector sum_dims({batch_size, 1UL}); + sum.Resize(phi::make_ddim(sum_dims)); + ctx.template Alloc(&sum); + auto sum_mat = EigenMatrix::From(sum); + ctx.template Alloc(out); + auto out_mat = EigenMatrix::From(*out); + if (bias.get_ptr()) { + bit_code->Add(*(bias.get_ptr()), pre_out); + } + bit_code->Mul(pre_out, w, x); + // clip to [-40, 40] + paddle::platform::Transform trans; + trans(ctx, + pre_out_data, + pre_out_data + pre_out->numel(), + pre_out_data, + paddle::operators::ClipFunctor(static_cast(-40.0), + static_cast(40.0))); + bit_code->Sum(*pre_out, out, static_cast(-1)); + // use softrelu to calculate cross entropy + pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); + row_sum(ctx, *pre_out, &sum); + // TODO(guosheng): Subtract the out of path's loss, since not all + // class(leaf) nodes' path lengths equal code_length. But it won't break the + // gradient check since both have the out of path's loss and will cancel out + // each other. + out_mat.device(place) = sum_mat + out_mat; +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/index_select_grad_kernel.cc b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dd50e7df8f06dad1b4a4e51b48cda8d7e2c91eb --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + if (dim < 0) { + dim += out_grad.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectGradInner(ctx, out_grad, index, x_grad, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectGradInner( + ctx, out_grad, index, x_grad, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + CPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..163174580ff785910cc749711b2f917391a691ff --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -0,0 +1,178 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct IndexSelectAdd { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + for (int i = 0; i < slice_size; i++) { + dist_pointer[i] = src_pointer[i] + p_pointer[i]; + } + } +}; + +template +struct IndexSelectAdd< + Context, + T, + typename std::enable_if::value>::type> { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + auto blas = phi::funcs::GetBlas(ctx); + blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); + } +}; + +template +void IndexSelectInner(const Context& ctx, + DenseTensor* input, + const DenseTensor& index, + DenseTensor* output, + int dim) { + auto input_dim = input->dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + auto index_size = index.dims()[0]; + + DenseTensor index_cpu_copy; + if (!paddle::platform::is_cpu_place(index.place())) { + phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy); + } + const IndexT* index_data = paddle::platform::is_cpu_place(index.place()) + ? index.data() + : index_cpu_copy.data(); + ctx.template Alloc(output); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_data[i], + 0, + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_data[i])); + PADDLE_ENFORCE_LT( + index_data[i], + input_dim[dim], + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_data[i])); + } + + VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; index_size: " << index_size; + + input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size})); + output->Resize(phi::make_ddim({outer_nums, index_size, slice_size})); + + auto input_tensor = EigenTensor::From(*input); + auto output_tensor = EigenTensor::From(*output); + + auto& place = *ctx.eigen_device(); + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto output_t = output_tensor.chip(j, 1); + output_t.device(place) = input_tensor.chip(index_value, 1); + } + input->Resize(input_dim); + output->Resize(output_dim); +} + +template +void IndexSelectGradInner(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad, + int dim) { + const T* input_data = out_grad.data(); + const IndexT* index_data = index.data(); + + const T* p_output = ctx.template Alloc(x_grad); + T* out_data = ctx.template Alloc(x_grad); + + auto input_dim = out_grad.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = x_grad->dims(); + + phi::funcs::SetConstant set_constant; + set_constant(ctx, x_grad, static_cast(0.0)); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; input_width: " << input_width + << "; output_width: " << output_width + << "; index_size: " << index_size; + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto src = input_data + input_start_offset + j * slice_size; + auto p_out = p_output + output_start_offset + index_value * slice_size; + auto dst = out_data + output_start_offset + index_value * slice_size; + IndexSelectAdd index_select_add; + index_select_add(ctx, slice_size, src, p_out, dst); + } + } + x_grad->Resize(output_dim); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/index_select_kernel.cc b/paddle/phi/kernels/cpu/index_select_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5341ede6b2fd846ee3c14d092d166f2832e3bff7 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto inputs = x; + if (dim < 0) { + dim += inputs.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + CPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..633c6ba093e42762e3d5b64415d6098c3add6b8a --- /dev/null +++ b/paddle/phi/kernels/cpu/isclose_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/isclose_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isclose_kernel_impl.h" + +PD_REGISTER_KERNEL( + isclose, CPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9399d38d711f56305641c9f3170306bacdd6095 --- /dev/null +++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + kldiv_loss_grad, CPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c462b8ec32c89dfcf2657018baf9b13764f2858e --- /dev/null +++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" + +namespace phi {} // namespace phi + +PD_REGISTER_KERNEL( + kldiv_loss, CPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..185d6cbedc85db83032cecd3c2f6cd1b0f46cbaf --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void kthvalueAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(d_x); + if (axis == in_dims.size() - 1) { + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + memset(x_grad_data, 0, d_x->numel() * sizeof(T)); + if (keepdim) { + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &d_out, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + DDim trans_dims(out_dims); + DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + DenseTensor trans_dO, trans_ind; + trans_dO.Resize(trans_dims); + trans_ind.Resize(trans_dims); + dev_ctx.template Alloc(&trans_dO); + dev_ctx.template Alloc(&trans_ind); + int ndims = trans.size(); + if (keepdim) { + funcs::TransCompute( + ndims, dev_ctx, d_out, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_in_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, d_x->numel() * sizeof(T)); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, d_x, trans); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + CPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e436623cae7bbc27903eff8e2bf01a41ded9c94 --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void getKthvalue(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices, + const int& k) { + bool partial_sort_flag = (k * 64) < input_width; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), + col_vec.begin() + k, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } else { + std::nth_element( + col_vec.begin(), + col_vec.begin() + k - 1, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + t_out[i] = col_vec[k - 1].first; + t_indices[i] = col_vec[k - 1].second; + } +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + auto out_dims = output->dims(); + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + getKthvalue(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data, + k); + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_inp; + trans_inp.Resize(trans_dims); + dev_ctx.template Alloc(&trans_inp); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_inp, trans); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + DenseTensor tmp_out, tmp_indices; + tmp_out.Resize(trans_out_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + tmp_indices.Resize(trans_out_dims); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + getKthvalue( + input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k); + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + CPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..cee48ed96db1c60fb77dc7c870cb256b7ce0cb6e --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + auto* scale = scale_opt.get_ptr(); + auto d_y = out_grad; + + // init output + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; + + const auto& x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + + funcs::ColwiseSum2D colwise_sum(left, right, dev_ctx); + DenseTensor x_tmp = x; + + DenseTensor temp; + DenseTensor temp_norm; + if (d_scale || d_x) { + x_tmp.Resize(matrix_shape); + temp.Resize(matrix_shape); + dev_ctx.template Alloc(&temp); + + temp_norm.Resize(matrix_shape); + dev_ctx.template Alloc(&temp_norm); + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + x_tmp, + mean, + /*axis*/ 0, + funcs::SubtractFunctor(), + &temp_norm); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &temp_norm); + } + + if (d_bias) { + dev_ctx.template Alloc(d_bias); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + dev_ctx.template Alloc(d_scale); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, temp_norm, d_y, 0, funcs::MultiplyFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + DDim vec_shape({left}); + dev_ctx.template Alloc(d_x); + auto dx_dim = d_x->dims(); + DenseTensor temp_vec; + temp_vec.Resize(vec_shape); + dev_ctx.template Alloc(&temp_vec); + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + if (d_scale) { + // dy_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, d_y, *scale, /*axis*/ 1, funcs::MultiplyFunctor(), &temp); + phi::Copy(dev_ctx, temp, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp, + temp_norm, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + } else { + // dy_dx + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + d_y, + temp_norm, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + temp_vec, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, *d_x, temp, /*axis*/ 0, funcs::SubtractFunctor(), d_x); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + d_x); + d_x->Resize(dx_dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm_grad, CPU, ALL_LAYOUT, phi::LayerNormGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b09d68c7ca081e9a6157857eea8338aaa93d34d --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale_opt, + paddle::optional bias_opt, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + const auto x_dims = x.dims(); + auto* scale = scale_opt.get_ptr(); + auto* bias = bias_opt.get_ptr(); + + dev_ctx.template Alloc(y); + dev_ctx.template Alloc(mean); + dev_ctx.template Alloc(var); + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + auto x_tmp = x; + x_tmp.Resize(matrix_shape); + DenseTensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + +#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ + defined(__OSX__) + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + // get mean + row_mean(dev_ctx, x_tmp, mean); + + // get variance + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubAndSquareFunctor(), &out); + + row_mean(dev_ctx, out, var); + + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubtractFunctor(), &out); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + out, + *var, + 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &out); + + if (scale) { + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *scale, 1, funcs::MultiplyFunctor(), &out); + } + if (bias) { + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *bias, 1, funcs::AddFunctor(), &out); + } +#else + PADDLE_ENFORCE_EQ(mean->numel(), + left, + phi::errors::InvalidArgument( + "mean's length (%d) is not equal with expected (%d).", + mean->numel(), + left)); + PADDLE_ENFORCE_EQ(var->numel(), + left, + phi::errors::InvalidArgument( + "var's length (%d) is not equal with expected (%d).", + var->numel(), + left)); + if (scale) { + PADDLE_ENFORCE_EQ( + scale->numel(), + right, + phi::errors::InvalidArgument( + "scale's length (%d) is not equal with expected (%d).", + scale->numel(), + right)); + } + if (bias) { + PADDLE_ENFORCE_EQ(bias->numel(), + right, + phi::errors::InvalidArgument( + "bias's length (%d) is not equal with expected (%d).", + bias->numel(), + right)); + } + + auto ker = paddle::operators::jit::KernelFuncs< + paddle::operators::jit::LayerNormTuple, + phi::CPUPlace>::Cache() + .At(right); + ker(x_tmp.data(), + out.data(), + mean->data(), + var->data(), + scale ? scale->data() : nullptr, + bias ? bias->data() : nullptr, + static_cast(left), + static_cast(epsilon), + right); +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..116fa3f8d3f6a91ec0705b92ff65aa2a411f4f23 --- /dev/null +++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + lgamma_grad, CPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f849322174d295d95fcd9080e090d5a7ece0ec79 --- /dev/null +++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_kernel.h" + +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +struct LgammaFunctor { + LgammaFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = Eigen::numext::lgamma(input_[idx]); + } + + private: + const T* input_; + T* output_; + int64_t numel_; +}; + +template +void LgammaKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto numel = x.numel(); + auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + phi::funcs::ForRange for_range(dev_ctx, numel); + LgammaFunctor functor(x_data, out_data, numel); + for_range(functor); +} +} // namespace phi + +PD_REGISTER_KERNEL(lgamma, CPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f344b9cc3fe0a4c71470c361f2e8f370bc5908a --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct LogSoftmaxGradFunctor { + void operator()(const Context& context, + const DenseTensor* Y, + const DenseTensor* dY, + DenseTensor* dX, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int n = funcs::SizeToAxis(axis, Y->dims()); + const int d = funcs::SizeFromAxis(axis, Y->dims()); + phi::DDim dim_2d{n, d}; + + auto y = EigenMatrixTemplate::From(*Y, dim_2d); + auto dy = EigenMatrixTemplate::From(*dY, dim_2d); + auto dx = EigenMatrixTemplate::From(*dX, dim_2d); + + const int axis_dim = Y->dims()[axis]; + const int batch_size = y.dimension(kBatchDim); + const int num_classes = y.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + dx.device(*context.eigen_device()) = + dy - + (y.exp()) * (dy.reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis)); + } +}; + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = out.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(x_grad); + if (out.numel() != 0) { + LogSoftmaxGradFunctor()( + dev_ctx, &out, &out_grad, x_grad, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax_grad, + CPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..241742378cc5d012d2816745d0f83fc586089ef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = static_cast(-64.); + return x < kThreshold ? kThreshold : x; + } +}; + +template +struct LogSoftmaxFunctor { + void operator()(const Context& context, + const DenseTensor* X, + DenseTensor* Y, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + + int axis_dim = X->dims()[axis]; + const int n = funcs::SizeToAxis(axis, X->dims()); + const int d = funcs::SizeFromAxis(axis, X->dims()); + phi::DDim dim_2d{n, d}; + + auto logits = EigenMatrixTemplate::From(*X, dim_2d); + auto log_softmax = EigenMatrixTemplate::From(*Y, dim_2d); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into log_softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + log_softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + log_softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + log_softmax.device(*context.eigen_device()) = + log_softmax - + log_softmax.exp() + .eval() + .reshape(batch_axis_remain) + .sum(along_axis) + .log() + .broadcast(one_axis); + } +}; + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(out); + if (x.numel() != 0) { + LogSoftmaxFunctor()(dev_ctx, &x, out, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc deleted file mode 100644 index 250f656926c0536f71e5724eb9df779c1502a673..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/math_kernel.h" - -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" - -namespace phi { - -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - dev_ctx.template Alloc(out); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::name##Functor(), out); \ - } else { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ - } \ - } \ - } - -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - dev_ctx.template Alloc(out); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::DivideFunctor(), out); - } else { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); - } - } -} - -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) - -} // namespace phi - -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 -// using bfloat16 = ::phi::dtype::bfloat16; -PD_REGISTER_KERNEL(add_raw, - CPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - CPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide_raw, - CPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - CPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - CPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} -PD_REGISTER_KERNEL( - mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 70b6316e1044426a0743c8d5251ca7d210956356..ae1e406d16eec44168b2b7232586293bf90e4bd8 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -17,13 +17,13 @@ #include #include #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca813c1757eacce24ecea8687b7b80bd43c5e8f9 --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(x_grad); + + if (axis == in_dims.size() - 1) { + // allocate the memory for the input_grad + // assign the out_grad to input_grad directly + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements has no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + if (keepdim) { + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + DenseTensor indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + // can not assign grad to input_grad, must do the transpose + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + DDim trans_shape(out_dims); + DDim trans_in_shape(in_dims); + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = out_dims[trans_axis[i]]; + trans_in_shape[i] = in_dims[trans_axis[i]]; + } + // transpose the out_grad, indices + DenseTensor trans_dO; + trans_dO.Resize(trans_shape); + dev_ctx.template Alloc(&trans_dO); + + DenseTensor trans_ind; + trans_ind.Resize(trans_shape); + dev_ctx.template Alloc(&trans_ind); + + int ndims = trans_axis.size(); + + if (keepdim) { + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans_axis); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + + DenseTensor indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); + const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; + + // Assign the out_grad to tranpose input_grad + DenseTensor tmp_out; + tmp_out.Resize(trans_in_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + + // Transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_out, x_grad, trans_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + CPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6535d1b89af420ee4266981f004983157179f34f --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + auto out_dims = out->dims(); + // axis < 0, cacluate the real axis + if (axis < 0) axis += in_dims.size(); + + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + // if axis is not the last dim, transpose it to the last dim, do the + // calculation, then tranpose it back to original axis. + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetMode(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + // get the trans input_dims, out_dims + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + int ndims = trans_axis.size(); + + // transpose the input value + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_out_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + + DenseTensor tmp_indices; + tmp_indices.Resize(trans_out_shape); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + + funcs::GetMode( + input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind); + // transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5a426e93db2cf23962276632fead69565999d37 --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + auto* index = ids.data(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + CPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/multiplex_kernel.cc b/paddle/phi/kernels/cpu/multiplex_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d9f4c51a981ed8701afe0aa4e7d6a8955f4348c --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto index = ids.data(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + CPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/prelu_grad_kernel.cc b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..97558cdb31f666fd7c5dd8b15e1d7feef6556a0b --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + const T* alpha_ptr = alpha.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (x_grad) { + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[0] * out_grad_ptr[i]; + } + } + } + + index = 0; + if (alpha_grad) { + T* alpha_grad_ptr = dev_ctx.template Alloc(alpha_grad); + memset(alpha_grad_ptr, 0, sizeof(T) * alpha_grad->numel()); + + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + alpha_grad_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prelu_grad, CPU, ALL_LAYOUT, phi::PReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_kernel.cc b/paddle/phi/kernels/cpu/prelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f389ab9ff459d1935518f35e7884d144bec5020 --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, CPU, ALL_LAYOUT, phi::PReluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2e32567441ae8ff5315856e3f9132c9553f6d62 --- /dev/null +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/kernels/qr_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +namespace phi { + +static inline std::tuple ParseQrMode(const std::string& mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r) { + bool compute_q; + bool reduced_mode; + std::tie(compute_q, reduced_mode) = ParseQrMode(mode); + auto numel = x.numel(); + PADDLE_ENFORCE_GT( + numel, 0, errors::PreconditionNotMet("The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int x_stride = m * n; + int q_stride = m * k; + int r_stride = k * n; + auto* x_data = x.data>(); + T* q_data = nullptr; + if (compute_q) { + q_data = ctx.template Alloc>( + q, batch_size * m * k * sizeof(phi::dtype::Real)); + } + auto* r_data = ctx.template Alloc>( + r, batch_size * k * n * sizeof(phi::dtype::Real)); + + // Implement QR by calling Eigen + for (int i = 0; i < batch_size; ++i) { + const T* x_matrix_ptr = x_data + i * x_stride; + T* r_matrix_ptr = r_data + i * r_stride; + using EigenDynamicMatrix = + Eigen::Matrix; + auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); + Eigen::HouseholderQR qr(x_matrix); + if (reduced_mode) { + auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); + auto r_matrix_view = + qr_top_matrix.template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } else { + auto r_matrix_view = + qr.matrixQR().template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } + + if (compute_q) { + T* q_matrix_ptr = q_data + i * q_stride; + if (reduced_mode) { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } else { + auto q_matrix = qr.householderQ() * EigenDynamicMatrix::Identity(m, m); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(qr, CPU, ALL_LAYOUT, phi::QrKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index 4e268d40038cfb56b1e772e14b0ed7699f9700dd..af67bdf5d624f33fd4ec06db425ec8312b490642 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -239,4 +239,29 @@ void Reduce(const DeviceContext& dev_ctx, } } +template +void BoolReduceKernel(const DeviceContext& dev_ctx, + const phi::DenseTensor& input, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + phi::DenseTensor* output) { + dev_ctx.template Alloc(output); + + // The dims has full dim, set the reduce_all is True + const auto& input_dim_size = input.dims().size(); + std::set dims_set(dims.begin(), dims.end()); + bool full_dim = true; + for (auto i = 0; i < input_dim_size; i++) { + if (dims_set.find(i) == dims_set.end()) { + full_dim = false; + break; + } + } + reduce_all = (reduce_all || full_dim); + + ReduceKernelImpl( + dev_ctx, input, output, dims, keep_dim, reduce_all); +} + } // namespace phi diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc similarity index 53% rename from paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc rename to paddle/phi/kernels/cpu/reduce_grad_kernel.cc index efea054555e86be79b5cdb09fe8c4784a1ad0c3b..78a7ae8d415b5d4b18fdf8e469576db50f739e38 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc @@ -12,33 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" +#include "paddle/phi/kernels/reduce_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/cpu/reduce_grad.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" namespace phi { -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - template void ComputeFromInput(const Context& dev_ctx, const DenseTensor& x, @@ -111,16 +97,38 @@ void ReduceSumGradKernel(const Context& dev_ctx, } } - ReduceGradKernel(dev_ctx, - x, - out_grad, - paddle::none, - dims, - keep_dim, - reduce_all, - in_dtype, - out_dtype, - x_grad); + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); } } // namespace phi @@ -137,3 +145,38 @@ PD_REGISTER_KERNEL(sum_grad, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double) {} + +PD_REGISTER_KERNEL(prod_grad, + CPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_kernel.cc b/paddle/phi/kernels/cpu/reduce_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..bc99e2cb39a6976943ba8fa77f7816c8f5e9b284 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_kernel.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + CPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +PD_REGISTER_KERNEL( + mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(prod_raw, + CPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} +PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc deleted file mode 100644 index 9a9bf46e948bc52c6a1e9679b4d3e51b10d89e6d..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_prod_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(reduce_prod, - CPU, - ALL_LAYOUT, - phi::ReduceProdKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a91b8b6c1fcd3306521fb7cbc26d8c7adaf2d4f8 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void bilinear_interpolate_gradient(const int height, + const int width, + T y, + T x, + const T out_grad_this_bin, + const T count, + T* batch_grad_data) { + int x_low, y_low, x_high, y_high; + T w1, w2, w3, w4; + if (y < -1.0 || y > height || x < -1.0 || x > width) { + w1 = w2 = w3 = w4 = 0; + x_low = x_high = y_low = y_high = -1; + return; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + y_low = static_cast(y); + x_low = static_cast(x); + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + *(batch_grad_data + y_low * width + x_low) += diff1; + *(batch_grad_data + y_low * width + x_high) += diff2; + *(batch_grad_data + y_high * width + x_low) += diff3; + *(batch_grad_data + y_high * width + x_high) += diff4; + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + auto in_dims = x.dims(); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = boxes.dims()[0]; + + if (!dx) { + return; + } + + DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = roi_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + + if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) { + return; + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + auto in_stride = phi::stride(x.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + T roi_offset = aligned ? T(0.5) : 0; + for (int n = 0; n < rois_num; ++n) { + int box_batch_idx = box_batch_id_data[n]; + T roi_xmin = boxes_data[0] * spatial_scale - roi_offset; + T roi_ymin = boxes_data[1] * spatial_scale - roi_offset; + T roi_xmax = boxes_data[2] * spatial_scale - roi_offset; + T roi_ymax = boxes_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + for (int c = 0; c < channels; ++c) { + T* batch_grad_data = + dx_data + box_batch_idx * in_stride[0] + c * in_stride[1]; + const T* batch_out_grad_data = + out_grad_data + n * out_stride[0] + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + T out_grad_this_bin = batch_out_grad_data[pool_index]; + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_width / pooled_width); + T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + bilinear_interpolate_gradient(height, + width, + y, + x, + out_grad_this_bin, + count, + batch_grad_data); + } + } + } + } + } + boxes_data += roi_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_align_grad, + CPU, + ALL_LAYOUT, + phi::RoiAlignGradKernel, + float, + double, + int) {} diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index 35ab99a98eba7e59853fb311d5b2307b69ae31b2..4752a9b3a48fdcce5f3211a7aadca663fb44aa05 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -179,7 +179,7 @@ void AvgPool(const std::vector& interpolated_values, } template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, @@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {} + roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {} diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0eaa873590eb0ce16933de474cc028e751fdd4a9 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + if (dx) { + int rois_num = boxes.dims()[0]; + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + const int64_t* arg_max_data = arg_max.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + auto in_stride = phi::stride(x.dims()); + auto arg_max_stride = phi::stride(arg_max.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + int channels = x.dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = box_batch_id_data[n]; + T* batch_grad_data = dx_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (arg_max_data[pool_index] >= 0) { + auto index = arg_max_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + boxes_data += roi_stride[0]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_pool_grad, + CPU, + ALL_LAYOUT, + phi::RoiPoolGradKernel, + float, + double, + int) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..02020354cd35701b5fdcd1e8beae87bc813ca18f --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + auto in_stride = phi::stride(x_dims); + auto arg_max_stride = phi::stride(arg_max->dims()); + auto box_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out->dims()); + + const T* input_data = x.data(); + + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + int rois_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + rois_num_with_lod, + phi::errors::InvalidArgument("The rois_num from input " + "and lod must be the same.")); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + T* output_data = dev_ctx.template Alloc(out); + int64_t* arg_max_data = dev_ctx.template Alloc(arg_max); + + const T* boxes_data = boxes.data(); + for (int n = 0; n < rois_num; ++n) { + int box_batch_id = box_batch_id_data[n]; + int box_start_w = round(boxes_data[0] * spatial_scale); + int box_start_h = round(boxes_data[1] * spatial_scale); + int box_end_w = round(boxes_data[2] * spatial_scale); + int box_end_h = round(boxes_data[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int box_height = std::max(box_end_h - box_start_h + 1, 1); + int box_width = std::max(box_end_w - box_start_w + 1, 1); + + const float bin_size_h = + static_cast(box_height) / static_cast(pooled_height); + const float bin_size_w = + static_cast(box_width) / static_cast(pooled_width); + + const T* batch_data = input_data + box_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * box_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * box_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + box_start_h, 0), height); + hend = std::min(std::max(hend + box_start_h, 0), height); + wstart = std::min(std::max(wstart + box_start_w, 0), width); + wend = std::min(std::max(wend + box_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ? 0 : -std::numeric_limits::max(); + arg_max_data[pool_index] = -1; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + arg_max_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + // Increment ROI data pointer + boxes_data += box_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool, CPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double, int) { + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/roll_grad_kernel.cc b/paddle/phi/kernels/cpu/roll_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0d0c0663e4a2eb71f4500baaf43bc8a891acddd --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_grad_kernel.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector out_vec; + paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = out_grad.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]); + } + + dev_ctx.template Alloc(x_grad); + paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad); + x_grad->Resize(out_grad.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + CPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel.cc b/paddle/phi/kernels/cpu/roll_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..25b64ef257dfb801f0050aad388b9fb0b3020ea5 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + std::vector out_vec; + paddle::framework::TensorToVector(x, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = x.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + PADDLE_ENFORCE_EQ( + dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(axis[%d]) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.", + i, + input_dim.size(), + input_dim.size() - 1, + i, + dims[i])); + ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]); + } + dev_ctx.template Alloc(out); + paddle::framework::TensorFromVector(out_vec, dev_ctx, out); + out->Resize(x.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + CPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel_impl.h b/paddle/phi/kernels/cpu/roll_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..924e71aff31f3f874fb35586f496b9c5952c3757 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel_impl.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +inline void ShiftAlongDim(T* data, + const DDim& input_dim, + int64_t dim, + int64_t shift) { + if (dim < 0) { + dim += input_dim.size(); + } + if (input_dim[dim] == 0) { + return; + } + shift = shift % input_dim[dim]; + if (shift < 0) { + shift += input_dim[dim]; + } + + auto outer_loops = 1; + for (auto i = 0; i < dim; i++) { + outer_loops *= input_dim[i]; + } + auto slice_width = 1; + for (auto i = dim + 1; i < input_dim.size(); i++) { + slice_width *= input_dim[i]; + } + + VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim + << "; dim: " << dim << "; shift: " << shift + << "; outer_loops: " << outer_loops + << "; slice_width: " << slice_width; + if (shift == 0) { + return; + } + + std::vector head; + auto head_size = slice_width * (input_dim[dim] - shift); + head.resize(head_size); + + for (auto i = 0; i < outer_loops; i++) { + for (auto j = 0; j < head_size; j++) { + head[j] = data[i * input_dim[dim] * slice_width + j]; + } + for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { + auto dst_pos = j - input_dim[dim] + shift; + for (auto k = 0; k < slice_width; k++) { + data[(i * input_dim[dim] + dst_pos) * slice_width + k] = + data[(i * input_dim[dim] + j) * slice_width + k]; + } + } + for (auto j = 0; j < head_size; j++) { + data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index 585c27bdcec97e11a68cdc536c829f76c000a8df..a5c9dc4c55e495833f40ec7499e6c0373594d319 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index d0413457f8177338aa450211539dc16d0880c74c..ad76a7a86bcb28f291288418c43740ed0b7adb97 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + CPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/shape_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc similarity index 74% rename from paddle/phi/kernels/cpu/shape_kernel.cc rename to paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc index 073dc63b2a4348d4091af8c285f9ddebd799acc5..14aca258a2c71a0651868f6917e2707987179ee0 100644 --- a/paddle/phi/kernels/cpu/shape_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc @@ -12,22 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/shape_kernel.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL(shape, +PD_REGISTER_KERNEL(tril_triu_grad, CPU, ALL_LAYOUT, - phi::ShapeKernel, + phi::TrilTriuGradKernel, bool, - int, - int8_t, - uint8_t, - int64_t, float, double, - phi::dtype::complex, - phi::dtype::complex) {} + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3d20e55e21fb6e11f63ef05f5de63fbc51caf5e --- /dev/null +++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu, + CPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc index ab3d3c2376b8b05b4909f2c44df260299c2fe460..4247e597acef4aac14f93066a3ea6232734e0c8c 100644 --- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -37,13 +37,8 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, T* data = dev_ctx.template Alloc(tensor); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b3cb17b28e07f3d9d4d0a1671acc9d639b855e08 --- /dev/null +++ b/paddle/phi/kernels/cumprod_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumprodGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + int dim, + DenseTensor* dx); +} // phi diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..96d76cb0f43702cb5798ec9c2d527464ea51ba1f --- /dev/null +++ b/paddle/phi/kernels/cumprod_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumprodKernel(const Context& dev_ctx, + const DenseTensor& x, + int dim, + DenseTensor* out); +} // phi diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h index fd90c7b8f5eee81b517013069ca9c2b366aa7d13..f105c94d559d873c8a11025a3c8c931010050445 100644 --- a/paddle/phi/kernels/cumsum_kernel.h +++ b/paddle/phi/kernels/cumsum_kernel.h @@ -18,7 +18,7 @@ namespace phi { -template +template void CumsumKernel(const Context& dev_ctx, const DenseTensor& x, int axis, diff --git a/paddle/phi/kernels/deformable_conv_kernel.h b/paddle/phi/kernels/deformable_conv_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3886e6801a31bf9f747b324ae4c355bd48c53cd7 --- /dev/null +++ b/paddle/phi/kernels/deformable_conv_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const DenseTensor& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/determinant_grad_kernel.h b/paddle/phi/kernels/determinant_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..87228afc51b52bb95838d3161f372c2ebae19b2c --- /dev/null +++ b/paddle/phi/kernels/determinant_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DeterminantGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/determinant_kernel.h b/paddle/phi/kernels/determinant_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..abd5f5691b3e5df6274db1bcd3a915c80bf93ec4 --- /dev/null +++ b/paddle/phi/kernels/determinant_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc similarity index 70% rename from paddle/phi/kernels/math_kernel.cc rename to paddle/phi/kernels/elementwise_kernel.cc index a5d3f51e5447fa41447c4b59c3beb8c917f8a0e5..9d10a48c9e0795d8914c0c6cfb49b7686575cfac 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -12,34 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); -} - template void AddKernel(const Context& dev_ctx, const DenseTensor& x, @@ -81,25 +60,6 @@ void MultiplyKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PD_REGISTER_KERNEL( - mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} - -PD_REGISTER_KERNEL(sum, - CPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - PD_REGISTER_KERNEL(add, CPU, ALL_LAYOUT, @@ -147,32 +107,7 @@ PD_REGISTER_KERNEL(multiply, phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(mean, - GPU, - ALL_LAYOUT, - phi::MeanKernel, - float, - double, - bool, - int, - int64_t, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sum, - GPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} + PD_REGISTER_KERNEL(add, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index c1e73ad91c67d415437829d5fc731ac91a5722f5..b064ecc454c592df49670205163e73d2d3b249b3 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" +#include "paddle/phi/infermeta/binary.h" namespace phi { @@ -33,4 +33,100 @@ void ElementwiseFMinKernel(const Context& dev_ctx, int axis, DenseTensor* out); +template +void AddRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void SubtractRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void MultiplyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Add(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + AddKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Subtract(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + SubtractKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Divide(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + DivideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Multiply(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MultiplyKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + } // namespace phi diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index c8fb54bb102d389cf005bac6d0f0edb78fb845ee..6c5ffbd06e3a435d9568a6c4717d8ce83b5aec00 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -29,11 +29,17 @@ #include #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" + +#ifdef PADDLE_WITH_XPU_KP +#define __forceinline__ __inline__ +#endif namespace phi { namespace funcs { @@ -776,6 +782,447 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); + } +}; + +template +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x.tanh() * x.tanh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + out.device(d) = x * (temp1 || temp2).template cast(); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + dx.device(d) = dout * (temp1 || temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise +template +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + } +}; + +template +struct SoftShrinkGradFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + (x < static_cast(0)) + .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + dx.device(d) = (out > static_cast(0)) + .select(dout, dout * (out + static_cast(alpha))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + dx.device(d) = (x > static_cast(0)) + .select(dout, dout * static_cast(alpha) * x.exp()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* ddX, + DenseTensor* ddOut, + const DenseTensor* dOut, + DenseTensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); + + if (dX) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); + dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast(); + } + + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); + ddout.device(*d) = ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut -> SigmoidGradGrad -> DOutNew + DDX DDOut + + DDOut = (1-Out)*Out*DDX + DOutNew = (1-2*Out)*DOut*DDX +*/ +template +struct SigmoidGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); + + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); + dout_new.device(*d) = + (static_cast(1) - static_cast(2) * out) * dout * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); + ddout.device(*d) = (static_cast(1) - out) * out * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut D_Dout + DDx -> SigmoidTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (1-2*Out)*DDx*D_Dout_new + D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new + D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct SigmoidTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + auto d_dOutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); + d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - + static_cast(2) * dout * ddx * d_dOutNew; + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); + d_dOut.device(*d) = + (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); + d_ddx.device(*d) = + (static_cast(1) - out) * out * d_ddOut + + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) +// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public BaseActivationFunctor { @@ -1214,6 +1661,315 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + __device__ __forceinline__ T operator()(const T x) const { + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T l = static_cast(lambda); + return (x >= -l && x <= l) ? zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * tanh(x) * tanh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x + __device__ __forceinline__ T operator()(const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 0 : dout + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename phi::dtype::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = x, if x > 0 + // elu(x) = alpha * (e^x - 1), if x <= 0 + __device__ __forceinline__ T operator()(const T arg_x) const { + CT x = static_cast(arg_x); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = x > zero ? x : temp; + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType a = static_cast(alpha); + MPType out_pos = static_cast(out > zero); + MPType out_neg = static_cast(out <= zero); + return static_cast(dout * (out_pos + out_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_out, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType x = static_cast(arg_x); + MPType a = static_cast(alpha); + MPType x_pos = static_cast(x > zero); + MPType x_neg = static_cast(x <= zero); + return static_cast(dout * (x_pos + x_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSiluFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(x / (one + exp(-x))); + } +}; + +template +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(one / (one + exp(-x))); + } +}; + +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * out * (one - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; + +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + __device__ __forceinline__ T operator()(const T x) const { + T temp = x * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return (out > zero && out < one) ? dout * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/cumprod.h b/paddle/phi/kernels/funcs/cumprod.h new file mode 100644 index 0000000000000000000000000000000000000000..ac40523c1c4378b3b8b20e9f32c59e664ee1eafc --- /dev/null +++ b/paddle/phi/kernels/funcs/cumprod.h @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +static void GetCumprodDimInfo(const DDim& dim, + int cumprod_dim, + size_t* outer_dim, + size_t* mid_dim, + size_t* inner_dim) { + PADDLE_ENFORCE_GE( + cumprod_dim, + -dim.size(), + phi::errors::InvalidArgument( + "The input dim of CumprodOp should be larger than the opposite " + "rank of input x which is %d.But received dim=%d", + -dim.size(), + cumprod_dim)); + PADDLE_ENFORCE_LT(cumprod_dim, + dim.size(), + phi::errors::InvalidArgument( + "The input dim of CumprodOp should be smaller than the " + "rank of input x which is %d.But received dim=%d", + dim.size(), + cumprod_dim)); + if (cumprod_dim < 0) cumprod_dim += dim.size(); + + *outer_dim = 1; + for (int i = 0; i < cumprod_dim; ++i) { + *outer_dim *= dim[i]; + } + *mid_dim = dim[cumprod_dim]; + *inner_dim = 1; + for (int i = cumprod_dim + 1; i < dim.size(); ++i) { + *inner_dim *= dim[i]; + } +} +} // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index f9e66836a62699f16f0ea32c2af9175d1a1b88b2..ac262fe2d571e587e3bdfa6a2d4e58bd5b865e68 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -67,6 +67,11 @@ struct InverseMultiplyFunctor { } }; +template +struct IsZeroFunctor { + HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } +}; + // Divide #define DIV_ERROR_INFO \ "InvalidArgumentError: Integer division by zero encountered in " \ diff --git a/paddle/phi/kernels/funcs/layer_norm_util.h b/paddle/phi/kernels/funcs/layer_norm_util.h new file mode 100644 index 0000000000000000000000000000000000000000..e78730cbf38495637e4bd4c455a3f522b38a9017 --- /dev/null +++ b/paddle/phi/kernels/funcs/layer_norm_util.h @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +// Wrap RowwiseMean and ColwiseMean. +// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is +// significantly faster. Unlike the RowwiseMean and ColwiseMean, the +// implementation only considers 2D. +template +struct RowwiseMean2D { + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({right_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); + } + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(false, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + row_mean_(context, input, out); + } + + private: + phi::funcs::RowwiseMean row_mean_; +}; + +template +struct ColwiseSum2D { + ColwiseSum2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const phi::DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::GPUContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({left_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); + } + + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(true, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::CPUContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + col_wise_(context, input, out); + } + + private: + phi::funcs::ColwiseSum col_wise_; +}; + +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 4201a75be8ac7ee9f7e633f6def1e002ce4b7e8a..afa2214f5b9df968d9fe01f6310e151c12e19362 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -331,12 +331,20 @@ template struct ColwiseSum; template struct ColwiseSum; template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + template struct RowwiseSum; template struct RowwiseSum; template struct RowwiseMean; template struct RowwiseMean; +template struct RowwiseMean; +template struct RowwiseMean; + template struct ElementwiseAddTo { void operator()(paddle::platform::CPUDeviceContext* ctx, diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h new file mode 100644 index 0000000000000000000000000000000000000000..1b7641762e2639acf3db540280891b518f22eed2 --- /dev/null +++ b/paddle/phi/kernels/funcs/mode.h @@ -0,0 +1,197 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLML +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +static int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +static inline void GetDims( + const phi::DDim& dim, int axis, int* pre, int* n, int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template +static void GetMode(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + std::sort(col_vec.begin(), + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + T mode = 0; + int64_t indice = 0; + int64_t cur_freq = 0; + int64_t max_freq = 0; + for (int64_t i = 0; i < input_width; ++i) { + ++cur_freq; + if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { + if (cur_freq > max_freq) { + max_freq = cur_freq; + mode = col_vec[i].first; + indice = col_vec[i].second; + } + cur_freq = 0; + } + } + t_out[i] = mode; + t_indices[i] = indice; + } +} + +template +static void ModeAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +static void GetModebySort(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + T* out_tensor, + int64_t* indices_tensor) { + DenseTensor input_tmp; + input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); + T* input_tmp_data = dev_ctx.Alloc(&input_tmp); + phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp); + + thrust::device_ptr out_tensor_ptr(out_tensor); + thrust::device_ptr indices_tensor_ptr(indices_tensor); + + for (int64_t i = 0; i < num_rows; ++i) { + T* begin = input_tmp_data + num_cols * i; + T* end = input_tmp_data + num_cols * (i + 1); + thrust::device_vector indices_data(num_cols); + thrust::sequence( + thrust::device, indices_data.begin(), indices_data.begin() + num_cols); + thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); + int unique = 1 + thrust::inner_product(thrust::device, + begin, + end - 1, + begin + 1, + 0, + thrust::plus(), + thrust::not_equal_to()); + thrust::device_vector keys_data(unique); + thrust::device_vector cnts_data(unique); + thrust::reduce_by_key(thrust::device, + begin, + end, + thrust::constant_iterator(1), + keys_data.begin(), + cnts_data.begin()); + auto it = thrust::max_element( + thrust::device, cnts_data.begin(), cnts_data.begin() + unique); + T mode = keys_data[it - cnts_data.begin()]; + int64_t counts = cnts_data[it - cnts_data.begin()]; + auto pos = thrust::find(thrust::device, begin, end, mode); + int64_t index = indices_data[pos - begin + counts - 1]; + out_tensor_ptr[i] = static_cast(mode); + indices_tensor_ptr[i] = static_cast(index); + } +} +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 4cf5e1c02c59757ee8bd0ae91c18d0882b702da1..417c1cd234754f994383988c63ff44ba06794822 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // paddle::platform::ChangeThreadNum(context, &thread_num); + // backends::gpu::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -460,7 +460,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -527,7 +527,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1293,7 +1293,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1369,7 +1369,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1906,7 +1906,7 @@ class MaxPool2dWithIndexFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -2205,7 +2205,7 @@ class MaxPool3dWithIndexFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index 19c6d52c4c9018f821c4e7f6ddaebf933aa045e8..fa285dc69d1ca552afbd1f41e050ee603be07239 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -43,7 +43,7 @@ template class MaxPool { public: DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + HOSTDEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } DEVICE inline void finalize(const T& pool_field, T* y) {} }; diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 5834f091d9a4de02afe7488ededc0189ae6f21d0..85c371e9f9d450c55741b901eff6f102fa6c3f6f 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -14,8 +14,8 @@ #pragma once -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// CUDA, XPU and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__) #include #include @@ -220,7 +220,7 @@ struct IndexCalculator { phi::Array dims; phi::Array strides; phi::Array reduce_strides; -#ifndef PADDLE_WITH_XPU2 +#ifndef PADDLE_WITH_XPU_KP phi::Array divmoders; #endif }; @@ -231,81 +231,65 @@ struct ReduceIndexMapping { HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) : dim(dims) {} +#ifdef PADDLE_WITH_XPU_KP __device__ __forceinline__ int BlockIdX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() / dim.split_num_x % dim.split_num_y); } else { return cluster_id() % dim.split_num_x; } -#else - return blockIdx.x; -#endif } __device__ __forceinline__ int BlockIdY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() % dim.split_num_x); } else { return (cluster_id() / dim.split_num_x % dim.split_num_y); } -#else - return blockIdx.y; -#endif } - __device__ __forceinline__ int BlockDimX() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_x; -#else - return blockDim.x; -#endif - } + __device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; } - __device__ __forceinline__ int BlockDimY() { -#ifdef PADDLE_WITH_XPU2 - return 1; -#else - return blockDim.y; -#endif - } + __device__ __forceinline__ int BlockDimY() { return 1; } __device__ __forceinline__ int GridDimX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_y; } else { return dim.split_num_x; } -#else - return gridDim.x; -#endif } __device__ __forceinline__ int GridDimY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_x; } else { return dim.split_num_y; } -#else - return gridDim.y; -#endif } __device__ __forceinline__ int GetLoopSize() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.deal_size_y; } else { return dim.deal_size_x; } + } #else - return 1; + __device__ __forceinline__ int BlockIdX() { return blockIdx.x; } + + __device__ __forceinline__ int BlockIdY() { return blockIdx.y; } + + __device__ __forceinline__ int BlockDimX() { return blockDim.x; } + + __device__ __forceinline__ int BlockDimY() { return blockDim.y; } + + __device__ __forceinline__ int GridDimX() { return gridDim.x; } + + __device__ __forceinline__ int GridDimY() { return gridDim.y; } + + __device__ int GetLoopSize() { return 1; } #endif - } }; // when reduce_type == kReduceLastDim this struct will be used @@ -341,7 +325,7 @@ struct ReduceConfig { // when should_reduce_again is true, we need malloc temp space for temp data void SetOutputData(Ty* y_data, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, phi::DenseTensor* tmp) { if (should_reduce_again) { tmp->Resize(phi::make_ddim( @@ -640,9 +624,7 @@ struct ReduceConfig { int blocking_size; bool should_reduce_again; bool reduce_last_dim; - Ty* output_data; - dim3 block; dim3 grid; }; @@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x, kps::Reduce( &reduce_var, &reduce_var, reducer, reduce_last_dim); - if (need_store) { - y[store_offset + i] = static_cast(reduce_var); - } + + Ty result = static_cast(reduce_var); + kps::details::WriteData( + y + store_offset + i, &result, static_cast(need_store)); } } @@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceAnyKernel<<>>( + OneDimIndexCal><<>>( x_data, config.output_data, reducer, @@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } else { int reduce_rank = config.reduce_strides.size(); @@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceAnyKernel<<>>( + IndexCalculator><<>>( x_data, config.output_data, reducer, @@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } if (config.should_reduce_again) { @@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data, config.left_num, config.grid.y, dim); -#endif } } @@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, @@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { PADDLE_THROW(phi::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); @@ -1087,12 +1030,16 @@ template class ReduceOp, typename TransformOp> -void ReduceKernel(const phi::GPUContext& dev_ctx, +void ReduceKernel(const KPDevice& dev_ctx, const phi::DenseTensor& x, phi::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims) { +#ifdef PADDLE_WITH_XPU_KP + auto stream = dev_ctx.x_context()->xpu_stream; +#else auto stream = dev_ctx.stream(); +#endif dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); @@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, 0); #ifdef PADDLE_WITH_XPU_KP + auto grid_num = 8; + auto block_num = 64; +#else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceHigherDimKernel, - TransformOp><<<8, 64, 0, stream>>>( + TransformOp><<>>( x_data, config.output_data, reducer, @@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.blocking_size, dim); -#else - ReduceHigherDimKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp><<>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#endif if (config.should_reduce_again) { dim3 block = dim3(config.block.x, 1, 1); @@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, dim2.SetRem(config.left_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.grid.y, dim2); -#endif } return; } diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index 4e83d0fa3710324f5fddd729d10cb8a541791562..b793afb63b1dca9bbd8ad09b83461567de6371ad 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -49,5 +49,106 @@ struct MaxFunctor { } }; +//////// Min Functor /////// +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +//////// All Functor /////// +struct AllFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->all(dim); + } +}; + +//////// Any Functor /////// +struct AnyFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->any(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h index 3488b6f2f86b20e0b758f3aa75a6739c40cd81db..11197a52261d7d0fd7618d2c7c0de09b57abe0d8 100644 --- a/paddle/phi/kernels/funcs/reduce_grad_functions.h +++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h @@ -41,14 +41,14 @@ void ReduceGradFunctor(const Context& dev_ctx, Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; - int broad_cats_times = 1; + int broad_cast_times = 1; for (size_t i = 0; i < dims_ref.size(); ++i) { if (dims_ref[i] < 0) { dims_ref[i] = x_rank + dims_ref[i]; } reduced_dims_v[dims_ref[i]] = 1; broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; - broad_cats_times *= x_dims[dims_ref[i]]; + broad_cast_times *= x_dims[dims_ref[i]]; } auto reduced_dims = phi::make_ddim(reduced_dims_v); auto x_reduce = EigenTensor::From(input1, reduced_dims); @@ -62,7 +62,7 @@ void ReduceGradFunctor(const Context& dev_ctx, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); + broad_cast_times); } inline void GetOriginDimFromShuffled(const DDim& src_dim, diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index bf4a21f37223dab5a67649406496e9828b0bcf3f..fbd744430aa11ab1a5a17c76b6d37c10c3085556 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -149,10 +149,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 305cd39f077bc359543b399a8775b5a92a2eb00d..95606b152672916116813c97cbbc0856d33e49a7 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -453,10 +453,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..3a1d9b8ea7a7a36c31f31f7fc60dffc1f827d34e --- /dev/null +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -0,0 +1,447 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace kps = phi::kps; + +namespace phi { +namespace funcs { +using Mode = kps::details::ReduceMode; + +/* +* Count how many of the data being processed by the current block are true +* 1. Load data from global memory and cast from bool to int64_t +* 2. Get result of this thread according to thread reduce +* 3. Get result of this block according to block reduce +* 4. first block store 0 and current result +*/ +template +struct NonZeroFunctor { + HOSTDEVICE NonZeroFunctor() {} + HOSTDEVICE inline T operator()(const T in) { + if (in) { + return static_cast(1); + } else { + return static_cast(0); + } + } +}; + +template +__device__ void GetBlockCountImpl(const InT *in, + OutT *out, + int num, + int repeat) { + InT in_data[VecSize]; + OutT temp[VecSize]; + OutT result = static_cast(0.0f); + using Add = kps::AddFunctor; + using Cast = NonZeroFunctor; + int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X; + + kps::Init(&in_data[0], static_cast(0.0f)); + kps::ReadData(&in_data[0], in, num); + kps::ElementwiseUnary( + &temp[0], &in_data[0], Cast()); + kps::Reduce( + &result, &temp[0], Add(), true); + kps::Reduce( + &result, &result, Add(), true); + if (store_fix == 0) { + // first block's fix_size = 0; + OutT tmp = static_cast(0.0f); + kps::WriteData(out + store_fix, &tmp, 1); + } + + // store num of this block + kps::WriteData(out + store_fix + 1, &result, 1); +} + +// Count how many data is not zero in current block +template +__global__ void GetBlockCountKernel(const InT *in, + OutT *out, + int64_t numel, + int64_t main_offset) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + for (; data_offset < main_offset; data_offset += stride) { + GetBlockCountImpl( + in + data_offset, out, BLOCK_NUM_X * VecSize, repeat); + repeat++; // to get the real blockIdx + } + + int num = numel - data_offset; + if (num > 0) { + GetBlockCountImpl( + in + data_offset, out, num, repeat); + } +} + +/* +* Get block num prefix us one block, VecSize must be 2 +* 1. Each thread load 2 data : threadIdx.x and threadIdx.x + blockDimx.x +* 2. Cumsum limitation is blockDim.x must be less than 512 +*/ + +template +__device__ void CumsumImpl( + const InT *in, OutT *out, OutT *pre_cumsum, int num, Functor func) { + __shared__ OutT max_thread_data; + OutT temp[VecSize]; + InT arg[VecSize]; + OutT result[VecSize]; + // init data_pr + kps::Init(&arg[0], static_cast(0.0f)); + // set pre_cumsum + kps::Init(&temp[0], *pre_cumsum); + // load data to arg + kps::ReadData( + &arg[0], in, num, 1, BLOCK_NUM_X, 1); + // block cumsum + kps::Cumsum(&result[0], &arg[0], func); + // result = cumsum_result + pre_cumsum + kps::ElementwiseBinary( + &result[0], &result[0], &temp[0], func); + // get the last prefix sum + if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) { + max_thread_data = result[VecSize - 1]; + } + __syncthreads(); + // update pre_cumsum + *pre_cumsum = max_thread_data; + kps::WriteData( + out, &result[0], num, 1, BLOCK_NUM_X, 1); +} + +// Compute this store_offset of this block +template +__global__ void CumsumOneBlock( + const InT *in, OutT *out, int numel, int main_offset, Functor func) { + int stride = BLOCK_NUM_X * VecSize; + int offset = 0; + OutT pre_cumsum = static_cast(0); + for (; offset < main_offset; offset += stride) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, BLOCK_NUM_X * VecSize, func); + } + + int num = numel - offset; + if (num > 0) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, num, func); + } +} + +template +struct SelectCaller { + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + // where_index op + IdT index_reg[VecSize]; + // Set data index of global + kps::InitWithDataIndex(&index_reg[0], data_offset); + // Get store data according to mask_idt + kps::OperatorTernary( + store_data, mask_data, &index_reg[0], func, VecSize); + } +}; + +template +struct SelectCaller { // masked_select + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + InT in_data[VecSize]; + kps::ReadData(&in_data[0], in, num); + // Get store data according to mask_idt + kps::OperatorTernary( + store_data, mask_data, &in_data[0], func, VecSize); + } +}; + +/** +* Get mask's index if mask == true +*/ +template // SelectType = 1 Mask_select else where_index +__device__ void +SelectKernelImpl(OutT *out, + const MT *mask, + const InT *in, + Functor func, + int num, + int data_offset, + int store_rank) { + const int kCVecSize = 2; + // each thread cumsum 2 data + using IdT = int64_t; + // Set index data type + using Add = kps::AddFunctor; // for cumsum + using Cast = NonZeroFunctor; // for mask + + IdT init_idx = static_cast(0.0f); + MT init_mask = static_cast(0.0f); + + IdT num_thread[kCVecSize]; + IdT cumsum_thread[kCVecSize]; + + OutT store_data[VecSize * phi::DDim::kMaxRank]; + MT mask_data[VecSize]; + IdT mask_idt[VecSize]; + // init data_pr + kps::Init(&cumsum_thread[0], init_idx); + kps::Init(&num_thread[0], init_idx); + kps::Init(&mask_data[0], init_mask); + // Load mask + kps::ReadData(&mask_data[0], mask, num); + // Cast from MT to int + kps::ElementwiseUnary( + &mask_idt[0], &mask_data[0], Cast()); + // Get the num of thread only num_thread[1] has data + kps::Reduce( + &num_thread[0], &mask_idt[0], Add(), true); + // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the + // thread_fix + kps::Cumsum(&cumsum_thread[0], &num_thread[0], Add()); + // Get store data(index) according to mask_idt + SelectCaller + compute; + compute(&store_data[0], &mask_data[0], in, func, num, data_offset); + // get thread_fix + int thread_fix = + (static_cast(cumsum_thread[0] - num_thread[0]) * store_rank); + // get how many data need to store + int store_num = static_cast(num_thread[0]) * store_rank; + // thread store num data, each thread may has different num + kps::details::WriteData(out + thread_fix, &store_data[0], store_num); +} + +template +__global__ void SelectKernel(OutT *out, + const MT *mask, + const InT *in, + CT *cumsum, + Functor func, + const int64_t numel, + int64_t main_offset, + int store_rank) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + int size = VecSize * BLOCK_ID_X; + for (; data_offset < main_offset; data_offset += stride) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // niuliling todo: us ReadData API + int block_store_offset = cumsum[idx_cumsum]; + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + size, + data_offset, + store_rank); + repeat++; + } + + int num = numel - data_offset; + if (num > 0) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // niuliling todo: us ReadData API + int block_store_offset = static_cast(cumsum[idx_cumsum]); + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + num, + data_offset, + store_rank); + } +} + +inline int64_t Floor(int64_t in, int64_t div) { return in / div * div; } + +// SelectData = 1 then masked_select; SelectData = 0 then where_index +template +void SelectKernel(const KPDevice &dev_ctx, + const DenseTensor &condition, + const DenseTensor &in_data, + DenseTensor *out, + Functor func) { + const MT *cond_data = condition.data(); + const int64_t numel = condition.numel(); + auto dims = condition.dims(); + int rank = SelectData ? 1 : dims.size(); + const InT *in_data_ptr = SelectData ? in_data.data() : nullptr; + // calculate the inclusive prefix sum of "true_num_array" + // to get the index of "out" tensor, + // and the total number of cond_data[i]==true. + // Example: + // condition: F T T F F F T T + // before: 0 1 1 0 0 0 1 1 + // after: 0 1 2 2 2 2 3 4 + // out: 1 2 6 7 + // alloc for cpu + using CT = int64_t; // set Count_data Type + const int t_size = sizeof(CT); + + const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + + // 1.1 get stored data num of per block + int total_true_num = 0; // init + const int kVecSize = 4; +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + auto stream = dev_ctx.x_context()->xpu_stream; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 8); +#else + const int block = 256; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 256); + auto stream = dev_ctx.stream(); +#endif + const int64_t main_offset = Floor(numel, num_per_block); + // 1.2 alloc tmp data for CoutBlock + const int size_count_block = need_grids + 1; + std::vector dims_vec = {size_count_block * 2}; + ScalarArray dims_array(dims_vec); + DenseTensor count_mem = phi::Empty(dev_ctx, dims_array); + CT *count_data = count_mem.data(); + // 1.3 launch CountKernl + GetBlockCountKernel<<>>( + cond_data, count_data, numel, main_offset); + // 2.1 alloc cumsum data for CoutBlock prefix + DenseTensor cumsum_mem = phi::Empty(dev_ctx, dims_array); + CT *cumsum_data = cumsum_mem.data(); + // 2.2 get prefix of count_data for real out_index + const int kCumVesize = 2; + const int block_c = 256; + const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c)); + using Add = kps::AddFunctor; + CumsumOneBlock<<<1, block_c, 0, stream>>>( + count_data, cumsum_data, size_count_block, main_offset_c, Add()); + // 3.1 set temp ptr for in; + // 3.1 alloc for out + // 3.1.1 get true_num for gpu place the last cumsum is the true_num + paddle::memory::Copy(cpu_place, + &total_true_num, + cuda_place, + cumsum_data + need_grids, + t_size, + dev_ctx.stream()); + + dev_ctx.Wait(); + // 3.1.2 allock for out with total_true_num + std::vector out_dim = {static_cast(total_true_num)}; + if (SelectData == 0) { // where_index + out_dim.push_back(rank); + } + out->Resize(phi::make_ddim(out_dim)); + auto out_data = out->mutable_data(cuda_place); + // 3.2 get true data's index according to cond_data and cumsum_data + if (total_true_num <= 0) return; + SelectKernel<<>>(out_data, + cond_data, + in_data_ptr, + cumsum_data, + func, + numel, + main_offset, + rank); +} + +} // namespace funcs +} // namespace phi + +#endif diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index 68fe8880a971dd7a56d677a5567bb053f5ba117a..19f1f3d3cd2fadff918da25cc873944e927a473a 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint( } inline void GetOutShape(const DDim& x_dims, - const DDim& kernel_dims, + const std::vector& kernel_sizes, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims, x_dims.size(), 5, phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); - PADDLE_ENFORCE_EQ(kernel_dims.size(), + PADDLE_ENFORCE_EQ(kernel_sizes.size(), 5, phi::errors::InvalidArgument( "the shape of kernel should be (D, H, W, C, OC)")); // infer out shape (*out_dims)[0] = x_dims[0]; - (*out_dims)[4] = kernel_dims[4]; + (*out_dims)[4] = kernel_sizes[4]; for (int i = 1; i < 4; i++) { (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - - dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / + dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) / strides[i - 1] + 1; } @@ -131,7 +131,7 @@ template inline void SubmPreProcess(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const int in_channels, const int out_channels, const int half_kernel_size, @@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx, blas.GEMM(CblasTrans, CblasNoTrans, x.non_zero_elements().dims()[1], - out_grad.non_zero_elements().dims()[1], + out_grad.dims()[1], x.non_zero_elements().dims()[0], static_cast(1), x.non_zero_elements().data(), - out_grad.non_zero_elements().data(), + out_grad.data(), static_cast(0), d_kernel_ptr + half_kernel_size * in_channels * out_channels); @@ -155,16 +155,36 @@ inline void SubmPreProcess(const Context& dev_ctx, T* x_grad_ptr = x_grad->data(); blas.GEMM(CblasNoTrans, CblasTrans, - out_grad.non_zero_elements().dims()[0], + out_grad.dims()[0], in_channels, - out_grad.non_zero_elements().dims()[1], + out_grad.dims()[1], static_cast(1), - out_grad.non_zero_elements().data(), + out_grad.data(), kernel.data() + half_kernel_size * in_channels * out_channels, static_cast(0), x_grad_ptr); } +inline const std::vector PoolResetKernel( + const std::vector& kernel_sizes, + const int in_channels, + const int out_channels) { + std::vector res(kernel_sizes); + res.resize(5); + res[3] = in_channels; + res[4] = out_channels; + return res; +} + +inline void PrefixSum(const int* counter, int* offsets, const int n) { + int offset = 0; + for (int i = 0; i < n; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[n] = offset; +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/tril_triu_compute.h b/paddle/phi/kernels/funcs/tril_triu_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d2b6f1e559d2b5bfc333b60359f5b1e56e9aaadb --- /dev/null +++ b/paddle/phi/kernels/funcs/tril_triu_compute.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +template +class TrilTriuCompute { + public: + HOSTDEVICE TrilTriuCompute(const T* in, + const int diagonal, + const bool lower, + const int64_t H, + const int64_t W, + T* out) + : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} + + HOSTDEVICE void operator()(int64_t idx) { + const int64_t row = (idx / W_) % H_; + const int64_t col = idx % W_; + const bool mask = + lower_ ? (col - row > diagonal_) : (col - row < diagonal_); + out_[idx] = mask ? static_cast(0) : in_[idx]; + } + + private: + const T* in_; + const int diagonal_; + const bool lower_; + const int64_t H_; + const int64_t W_; + T* out_; +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gather_grad_kernel.h b/paddle/phi/kernels/gather_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e53da7b471c7b82efef2319915cc57537ee824b5 --- /dev/null +++ b/paddle/phi/kernels/gather_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/gather_kernel.h similarity index 61% rename from paddle/phi/kernels/reduce_max_kernel.h rename to paddle/phi/kernels/gather_kernel.h index 7560473d43c718a80cfb8911cd250ef8fc74d82c..78ac09125b69298c59622fc69469ba8d28cae919 100644 --- a/paddle/phi/kernels/reduce_max_kernel.h +++ b/paddle/phi/kernels/gather_kernel.h @@ -14,25 +14,16 @@ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" namespace phi { template -void MaxRawKernel(const Context& dev_ctx, +void GatherKernel(const Context& dev_ctx, const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, + const DenseTensor& index, + const Scalar& axis, DenseTensor* out); -template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/gelu_grad_kernel.h b/paddle/phi/kernels/gelu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fd70e8d54bc8d004373efd1874f4b07a9ebde6a8 --- /dev/null +++ b/paddle/phi/kernels/gelu_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/gelu_kernel.h b/paddle/phi/kernels/gelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bc106a04031fbcc2a96209e170d60eda8cc7b5e1 --- /dev/null +++ b/paddle/phi/kernels/gelu_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define GELU_CONSTANT 0.044715 + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 00792b8ab607036112295f2dd4018c69eb78680a..c912d0c4686ff3fee88925f4d7121f38f24a5485 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -73,7 +73,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, } } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -84,7 +84,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ name, functor_class, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -99,7 +99,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ name, functor_class, attr1, attr2) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -116,7 +116,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ @@ -127,7 +127,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ name, functor_class, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -142,32 +142,87 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); - -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor); + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, CudaLeakyReluGradFunctor, alpha); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, CudaThresholdedReluGradFunctor, threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + CudaSoftShrinkGradFunctor, + lambda); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + CudaHardShrinkGradFunctor, + threshold); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, CudaBReluGradFunctor, t_min, t_max); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + CudaHardSigmoidGradFunctor, + slope, + offset); + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + std::vector ins = {&dout, &out}; + std::vector outs = {dx}; + if (alpha > 0) { + funcs::CudaELUGradFunctor functor; + functor.alpha = alpha; + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else { + funcs::CudaELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + ins.push_back(&x); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -234,3 +289,14 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 3c340a89f5746bd8de31826f7639e6ed0b7391f6..6b598c764debb059072ba3ae3ac90e6985479133 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -38,12 +38,13 @@ void ActivationGPUImpl(const Context& dev_ctx, funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); } -#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationGPUImpl(dev_ctx, x, out, functor); \ +#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ } #define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ @@ -75,26 +76,39 @@ void ActivationGPUImpl(const Context& dev_ctx, dev_ctx, x, out, functor); \ } -DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sin, CudaSinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asin, CudaAsinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atan, CudaAtanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sinh, CudaSinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cosh, CudaCoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asinh, CudaAsinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acosh, CudaAcoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atanh, CudaAtanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, CudaThresholdedReluFunctor, threshold) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, + CudaHardShrinkFunctor, + threshold) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + CudaHardSigmoidFunctor, + slope, + offset) } // namespace phi @@ -142,3 +156,11 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e8712462928d43077766a2fd03aee51d9a4cd8c --- /dev/null +++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu @@ -0,0 +1,320 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_grad_kernel.h" + +#include +#include "paddle/fluid/operators/math/inclusive_scan.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/for_range.h" +// NOTE(@xiongkun): use of IsComplex<> +#include "paddle/fluid/framework/data_type.h" + +namespace phi { + +template +struct CumprodGradFunctorExceptFirstZero { + HOSTDEVICE CumprodGradFunctorExceptFirstZero( + const T *x, + const T *y, + const T *dy_mul_y_reversed_cumsum, + const uint8_t *zero_mask, + size_t mid_dim, + size_t inner_dim, + T *dx, + int64_t *first_zero_idx, + T *x_filled_one) + : x_(x), + y_(y), + dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), + zero_mask_(zero_mask), + mid_dim_(mid_dim), + inner_dim_(inner_dim), + dx_(dx), + first_zero_idx_(first_zero_idx), + x_filled_one_(x_filled_one) {} + + HOSTDEVICE void operator()(size_t idx) const { + auto inner_idx = idx % inner_dim_; + auto outer_idx = idx / (mid_dim_ * inner_dim_); + auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; + auto mask = zero_mask_[idx]; + bool should_fill_one = true; + + if (mask == 0) { + dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; + if (mid_idx == mid_dim_ - 1) { + // record first zero position as -1, i.e., no zero + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; + } + } else if (mid_idx > 0) { // mask > 0 + if (zero_mask_[idx - inner_dim_] > 0) { // not first zero + dx_[idx] = 0; + should_fill_one = false; + } else { + // idx is the first zero position, it should be recorded + dx_[idx] = y_[idx - inner_dim_]; + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; + } + } else { // the first zero position is index 0 + dx_[idx] = 1; + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; + } + + x_filled_one_[idx] = should_fill_one ? 1 : x_[idx]; + } + + private: + const T *x_; + const T *y_; + const T *dy_mul_y_reversed_cumsum_; + const uint8_t *zero_mask_; + size_t mid_dim_; + size_t inner_dim_; + T *dx_; + int64_t *first_zero_idx_; + T *x_filled_one_; +}; + +template +struct FillFirstZeroPositionGradFunctor { + HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, + const T *grad_value, + size_t mid_dim, + size_t inner_dim, + T *dx) + : first_zero_idx_(first_zero_idx), + grad_value_(grad_value), + mid_dim_(mid_dim), + inner_dim_(inner_dim), + dx_(dx) {} + + HOSTDEVICE void operator()(size_t idx) const { + auto outer_idx = idx / inner_dim_; + auto inner_idx = idx % inner_dim_; + auto mid_idx = first_zero_idx_[idx]; + if (mid_idx >= 0) { + auto full_idx = + outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; + dx_[full_idx] *= grad_value_[full_idx]; + } + } + + private: + const int64_t *first_zero_idx_; + const T *grad_value_; + size_t mid_dim_; + size_t inner_dim_; + T *dx_; +}; + +template +void CumprodGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &out, + const DenseTensor &dout, + int dim, + DenseTensor *dx) { + const auto *y = &out; + const auto *dy = &dout; + + size_t outer_dim, mid_dim, inner_dim; + GetCumprodDimInfo(x.dims(), dim, &outer_dim, &mid_dim, &inner_dim); + if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; + + size_t numel = outer_dim * mid_dim * inner_dim; + + const auto *x_data = x.data(); + const auto *y_data = y->data(); + const auto *dy_data = dy->data(); + + auto place = dev_ctx.GetPlace(); + auto *dx_data = dev_ctx.template Alloc(dx); + + // deal with complex + const T *x_data_deal; + const T *y_data_deal; + Allocator::AllocationPtr x_conj; + Allocator::AllocationPtr y_conj; + if (paddle::framework::IsComplex::value) { + x_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *x_data_conj = reinterpret_cast(x_conj->ptr()); + y_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *y_data_conj = reinterpret_cast(y_conj->ptr()); + + phi::funcs::ForRange for_range_x(dev_ctx, numel); + phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); + for_range_x(functor_x); + + phi::funcs::ForRange for_range_y(dev_ctx, numel); + phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); + for_range_y(functor_y); + x_data_deal = x_data_conj; + y_data_deal = y_data_conj; + } else { + x_data_deal = x_data; + y_data_deal = y_data; + } + +// Step 1: find cummax-ed zero mask of x +#ifdef PADDLE_WITH_CUDA + const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); +#else + const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); +#endif + auto zero_mask_without_cummax = + const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(uint8_t)); + auto *zero_mask_without_cummax_data = + reinterpret_cast(zero_mask_without_cummax->ptr()); + thrust::transform(exec_policy, + thrust::device_pointer_cast(x_data_deal), + thrust::device_pointer_cast(x_data_deal) + numel, + thrust::device_pointer_cast(zero_mask_without_cummax_data), + funcs::IsZeroFunctor()); + + auto zero_mask = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(uint8_t)); + auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); + paddle::operators::math::InclusiveScan( + zero_mask_without_cummax_data, + zero_mask_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Max(), + /*reverse=*/false, + dev_ctx); + zero_mask_without_cummax = nullptr; + + // Step 2: calculate reversed cumsum(dy * y) + auto dy_mul_y = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); + thrust::transform(exec_policy, + thrust::device_pointer_cast(dy_data), + thrust::device_pointer_cast(dy_data) + numel, + thrust::device_pointer_cast(y_data_deal), + thrust::device_pointer_cast(dy_mul_y_data), + funcs::MultiplyFunctor()); + + auto dy_mul_y_reversed_cumsum = + const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *dy_mul_y_reversed_cumsum_data = + reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); + paddle::operators::math::InclusiveScan( + dy_mul_y_data, + dy_mul_y_reversed_cumsum_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Sum(), + /*reverse=*/true, + dev_ctx); + + // Step 3: calculate the gradient value except the first zero position. + // The gradient value of the first zero position is filled with out[idx-1], + // while the gradient value of the other positions are calculated out + // completely. This functor also: + // (1) find the first zero index, i.e., first_zero_idx_data. + // (2) fill x_filled_one, which satifies + // x_filled_one[i] = x[i], i > pos + // x_filled_one[i] = 1, i <= pos + auto first_zero_idx = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(int64_t)); + auto *first_zero_idx_data = + reinterpret_cast(first_zero_idx->ptr()); + auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory + phi::funcs::ForRange for_range(dev_ctx, numel); + CumprodGradFunctorExceptFirstZero functor_except_first_zero( + x_data_deal, + y_data_deal, + dy_mul_y_reversed_cumsum_data, + zero_mask_data, + mid_dim, + inner_dim, + dx_data, + first_zero_idx_data, + x_filled_one_data); + for_range(functor_except_first_zero); + + // Step 4: calculate cumprod of x_filled_one + auto *x_filled_one_cumprod_data = + dy_mul_y_reversed_cumsum_data; // reuse former allocated memory + paddle::operators::math::InclusiveScan>( + x_filled_one_data, + x_filled_one_cumprod_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(1), + funcs::MultiplyFunctor(), + /*reverse=*/false, + dev_ctx); + + // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) + auto *dy_mul_x_filled_one_cumprod = + dy_mul_y_data; // reuse former allocated memory + thrust::transform(exec_policy, + thrust::device_pointer_cast(dy_data), + thrust::device_pointer_cast(dy_data) + numel, + thrust::device_pointer_cast(x_filled_one_cumprod_data), + thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), + funcs::MultiplyFunctor()); + auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = + dy_mul_y_reversed_cumsum_data; // reuse former allocated memory + paddle::operators::math::InclusiveScan( + dy_mul_x_filled_one_cumprod, + dy_mul_x_filled_one_cumprod_reversed_cumsum, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Sum(), + /*reverse=*/true, + dev_ctx); + + // Step 6: fill zero pos gradient value + phi::funcs::ForRange for_range_fill_zero_pos_grad( + dev_ctx, outer_dim * inner_dim); + FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( + first_zero_idx_data, + dy_mul_x_filled_one_cumprod_reversed_cumsum, + mid_dim, + inner_dim, + dx_data); + for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod_grad, + GPU, + ALL_LAYOUT, + phi::CumprodGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/cumprod_kernel.cu b/paddle/phi/kernels/gpu/cumprod_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1bbf8972a24798e5ddebf4dac2b3745eb1a2aee0 --- /dev/null +++ b/paddle/phi/kernels/gpu/cumprod_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_kernel.h" + +#include "paddle/fluid/operators/math/inclusive_scan.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +template +void CumprodKernel(const Context &dev_ctx, + const DenseTensor &input, + int dim, + DenseTensor *out) { + const auto *x = &input; + auto *y = out; + size_t outer_dim, mid_dim, inner_dim; + GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); + + const auto *x_data = x->data(); + auto *y_data = dev_ctx.template Alloc(y); + paddle::operators::math::InclusiveScan(x_data, + y_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(1), + funcs::MultiplyFunctor(), + /*reverse=*/false, + dev_ctx); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod, + GPU, + ALL_LAYOUT, + phi::CumprodKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1db6e1b7cf73375f2617c727a26e5768922777d4 --- /dev/null +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ModulatedDeformableIm2colGpuKernel( + const int nthreads, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; + + ModulatedDeformableIm2colGpuKernel< + T><<>>(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + GPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cce12a87fac72f5ac6edbbeb74de9fe3ae9ede09 --- /dev/null +++ b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(determinant_grad, + GPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..25184083873952638a1f84d8d4b66262363ca9c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +PD_REGISTER_KERNEL( + determinant, GPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu index 2cffc68fa0648937b96095f5bd58210adaf865b3..a57d89013f921e3adb5587c70b7bbb12c383de61 100644 --- a/paddle/phi/kernels/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu @@ -13,9 +13,50 @@ // limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" +namespace phi { + +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + dev_ctx.template Alloc(out); \ + funcs::BroadcastKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ + } + +/** + * Kernels + */ + +// Create the definition of Add +DEFINE_CUDA_ELEMENTWISE_OP(Add) +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + PD_REGISTER_KERNEL(elementwise_fmax, GPU, ALL_LAYOUT, @@ -33,3 +74,55 @@ PD_REGISTER_KERNEL(elementwise_fmin, double, int, int64_t) {} + +PD_REGISTER_KERNEL(add_raw, + GPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract_raw, + GPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(divide_raw, + GPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(multiply_raw, + GPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128, + bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..04149a2f9ee41e797a66eedcb2d797fb87519041 --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_grad_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == DataType::INT32) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(x_grad); + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (out_grad.numel() == 0) return; + if (index_type == DataType::INT32) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } else if (index_type == DataType::INT64) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + GPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gather_kernel.cu b/paddle/phi/kernels/gpu/gather_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7e0c6cc168564e94c5af2e26a8f9ba4acc0594ed --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) return; + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + GPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int64_t, + int, + int16_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..2b9be7c6154354f7fd20b316610521a02801243f --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +#ifdef __NVCC__ +template +static __device__ __forceinline__ float FP32FastTanh(float x) { +#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 + if (FastMode) { + float y; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); + return y; + } +#endif + return tanhf(x); +} + +template +static __device__ __forceinline__ float FP32GeluFwd(float x) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + return x * 0.5f * (1.0f + tanh_out); +} + +template +static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * + (0.79788456f + 0.1070322243f * x * x)) + + 0.5f * (1.0f + tanh_out); + return tmp * y_g; +} + +template +static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, + __half* y, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT in_arr = *reinterpret_cast(x + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + float tmp = __half2float(in_arr[i]); + in_arr[i] = __float2half(FP32GeluFwd(tmp)); + } + *reinterpret_cast(y + offset) = in_arr; + } +} + +template +static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT x_in_arr = *reinterpret_cast(x + offset); + ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + __half2 tmp_fp16_2; + tmp_fp16_2.x = x_in_arr[i]; + tmp_fp16_2.y = y_g_in_arr[i]; + float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); + x_in_arr[i] = + __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); + } + *reinterpret_cast(x_g + offset) = x_in_arr; + } +} + +static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, const __half* x, __half* y, size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(y, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluFwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>(x, y, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL + return false; +} + +static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, + const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ + is_aligned(x_g, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluBwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>( \ + x, y_g, x_g, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL + return false; +} +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e21f8d4267bca5363d58b63e0a37d076b4d06af --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + MPType kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + auto cube_x = x * x * x; + auto tanh_out = + tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); + auto ans = + half * (one + tanh_out + + (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); + return static_cast(ans * dout); + } +}; + +template +struct GeluWithoutApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); + const MPType cdf = normcdf(x); + const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; + return static_cast(dout * (cdf + x * pdf)); + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + std::vector ins = {&x, &out_grad}; + std::vector outs = {x_grad}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* x_ptr = reinterpret_cast(x.data()); + const auto* y_g_ptr = reinterpret_cast(out_grad.data()); + auto* x_g_ptr = reinterpret_cast<__half*>(x_grad->data()); + if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + dev_ctx, x_ptr, y_g_ptr, x_g_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu_grad, + GPU, + ALL_LAYOUT, + phi::GeluGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ce6dda2d6cc6526853cf563779cfe5ad1a21ffe1 --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // this function is tanh approximation of gelu + MPType x = static_cast(arg_x); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + auto tanh_out = + tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); + MPType out = x * half * (one + tanh_out); + return static_cast(out); + } +}; + +template +struct GeluWithoutApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // actual gelu with approximation = false + MPType x = static_cast(arg_x); + return static_cast(x * normcdf(x)); + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* in_ptr = reinterpret_cast(x.data()); + auto* out_ptr = reinterpret_cast<__half*>(out->data()); + if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + dev_ctx, in_ptr, out_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, + GPU, + ALL_LAYOUT, + phi::GeluKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..457a348be832b006d9f224e3032c369a7fe4bb62 --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu @@ -0,0 +1,324 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd( + T* data, int h, int w, int sH, int sW, int H, int W, T delta) { + if (InBounds(h, w, H, W)) { + paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + int clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, int twice_low, int twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + int size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl); + } else { + coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl); + } + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + return coord; +} + +template +__global__ void GridSamplerCudaBackwardKernel(const int nthreads, + const T* grad_output, + const T* input, + const T* grid, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + + int gOut_sN = out_c * out_h * out_w; + int gOut_sC = out_h * out_w; + int gOut_sH = out_w; + int gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + int inp_offset_NC = n * inp_sN; + for (int c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (int c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSamplerCudaBackwardKernel< + T><<>>( + count, + out_grad.data(), + x.data(), + grid.data(), + n, + c, + out_h, + out_w, + in_h, + in_w, + x_grad->data(), + grid_grad_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + GPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f611b46911c4f1555ad27a538d8918f11ae761cc --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_kernel.cu @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + int size, + bool align_corners) { + if (align_corners) { + return ((coord + 1.f) / 2) * (size - 1); + } else { + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, int max_value) { + return min(static_cast(max_value), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + int twice_low, + int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + int size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size - 1); + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexes(coord, 0, 2 * (size - 1)); + } else { + coord = ReflectIndexes(coord, -1, 2 * size - 1); + } + coord = ClipIndexes(coord, size - 1); + } + return coord; +} + +template +__global__ void GridSampleCudaKernel(const int nthreads, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + const T* input, + const T* grid, + T* output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + int out_sN = out_c * out_h * out_w; + int out_sC = out_h * out_w; + int out_sH = out_w; + int out_sW = 1; + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + auto inp_offset_NC = n * inp_sN; + + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if (InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSampleCudaKernel< + T><<>>( + count, + n, + c, + out_h, + out_w, + in_h, + in_w, + x.data(), + grid.data(), + output_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, GPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..098eb9defb54904c41f33326b54eabdda657360a --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_utils.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { + +enum class Mode { + bilinear, + nearest, +}; + +enum class PaddingMode { zeros, border, reflect }; + +static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a393eecd51242193fa3b2192ff8e8f1111d350b6 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_grad_cuda_kernel(const T* output_grad, + T* input_grad, + const IndexT* index, + int64_t nums, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); +} + +template +__global__ void index_select_grad_init(T* input_grad, int64_t N) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + input_grad[idx] = 0.0; +} + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + auto* output_grad_data = out_grad.data(); + auto* in_grad_data = ctx.template Alloc(x_grad); + + auto input_dim = x_grad->dims(); + auto output_dim = out_grad.dims(); + dim = dim >= 0 ? dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + int64_t numel = x_grad->numel(); + int64_t index_nums = index.numel(); + int64_t out_nums = out_grad.numel(); + + auto stream = ctx.stream(); + + index_select_grad_init< + T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_grad_data, numel); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + GPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f774522318acb8f44798030870886dd1dc7accc1 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_cuda_kernel(const T* input, + T* output, + const IndexT* index, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + output[idx] = input[input_idx]; +} + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto input_dim = x.dims(); + auto output_dim = output->dims(); + dim = dim >= 0 ? dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + auto* in_data = x.data(); + T* out_data = ctx.template Alloc(output); + + int64_t numel = output->numel(); + auto stream = ctx.stream(); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_cuda_kernel<<< + (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_cuda_kernel< + T, + int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + GPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/isclose_kernel.cu b/paddle/phi/kernels/gpu/isclose_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..34774ec715c48de953945f94624c2c3cfe742d30 --- /dev/null +++ b/paddle/phi/kernels/gpu/isclose_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/isclose_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isclose_kernel_impl.h" + +PD_REGISTER_KERNEL( + isclose, GPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ca53f021f054278b813fedb69db31f4d2c5aaf6 --- /dev/null +++ b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + kldiv_loss_grad, GPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9388ac7071c3197c2a8774ffd5f0d670cdf8a8b2 --- /dev/null +++ b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" +PD_REGISTER_KERNEL( + kldiv_loss, GPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f6e96046a2bd799f4a6b8d30a239afb505582deb --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +static int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + if (axis < 0) axis += in_dims.size(); + T* x_grad_data = dev_ctx.template Alloc(d_x); + const T* out_grad_data = d_out.data(); + const int64_t* indices_data = indices.data(); + int pre, n, post; + paddle::operators::GetDims(in_dims, axis, &pre, &n, &post); + int block_size = getBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + paddle::operators::AssignGradWithAxis< + T><<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + GPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4218e153ec29bd1757b2405f0af638040de9bff2 --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu @@ -0,0 +1,252 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +inline int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +bool SortKthvalue(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + const int k, + DenseTensor* out_tensor, + DenseTensor* indices_tensor) { + auto cu_stream = dev_ctx.stream(); + DenseTensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = phi::make_ddim(dims); + input_indices.Resize(dim); + dev_ctx.template Alloc(&input_indices); + size_t temp_storage_bytes = -1; + int block_size = getBlockSize(num_cols); + unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0]; + unsigned int grid_size = num_rows < maxGridDimX + ? static_cast(num_rows) + : maxGridDimX; + paddle::operators::InitIndex< + int64_t><<>>( + input_indices.data(), num_rows, num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, + paddle::operators::SegmentOffsetIter(num_cols)); + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + DenseTensor temp_values, temp_indices; + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(dev_ctx.GetPlace()); + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = dev_ctx.template Alloc(&temp_values); + sorted_indices_ptr = dev_ctx.template Alloc(&temp_indices); + auto err = + cub::DeviceSegmentedRadixSort::SortPairs(nullptr, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " + << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, status: " + << cudaGetErrorString(err); + return false; + } +#endif + DenseTensor temp_storage; + temp_storage.Resize({static_cast(temp_storage_bytes / sizeof(uint8_t))}); + uint8_t* temp_storage_data = dev_ctx.template Alloc(&temp_storage); + + err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << cudaGetErrorString(err); + return false; + } +#endif + auto& dev = *dev_ctx.eigen_device(); + const Eigen::DSizes slice_indices{0, k - 1}; + const Eigen::DSizes slice_sizes{num_rows, 1}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = + EigenMatrix::From(static_cast(temp_indices)); + std::vector odims = {static_cast(num_rows), static_cast(1)}; + dim = phi::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = + EigenMatrix::From(static_cast(temp_values)); + + funcs::EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + funcs::EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); + return true; +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + auto out_dims = output->dims(); + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue( + dev_ctx, &x, input_width, input_height, k, output, indices), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + return; + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_input; + trans_input.mutable_data(trans_dims, dev_ctx.GetPlace()); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans); + DenseTensor trans_ind, trans_out; + trans_ind.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + trans_out.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue(dev_ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, trans_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + GPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c3f7a5261712a1d33bb4ad47dd080a489b303717 --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" + +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &mean, + const DenseTensor &variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor &out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + using U = paddle::operators::LayerNormParamType; + // d_x, d_scale, d_bias may be nullptr + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + auto *scale = scale_opt.get_ptr(); + auto *bias = bias_opt.get_ptr(); + auto *d_y = &out_grad; + + const auto &x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto *x_data = x.data(); + auto *d_y_data = d_y->data(); + + auto *mean_data = mean.data(); + auto *var_data = variance.data(); + + auto *d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); + + auto x_dtype = x.dtype(); + + phi::DataType scale_bias_dtype; + if (scale != nullptr) { + scale_bias_dtype = scale->dtype(); + } else { + // FIXME(zengjinle): do not find a better way to get the right + // data type of the d_scale and d_bias if scale == nullptr. + if (bias != nullptr) { + scale_bias_dtype = bias->dtype(); + } else { + scale_bias_dtype = x_dtype; + } + } + +#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + auto *scale_data = \ + (scale == nullptr ? nullptr : scale->data()); \ + auto *d_scale_data = \ + (d_scale == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_scale)); \ + auto *d_bias_data = \ + (d_bias == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_bias)); \ + auto *d_x_data = \ + (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); \ + paddle::operators::LayerNormBackward( \ + x_data, \ + d_y_data, \ + scale_data, \ + mean_data, \ + var_data, \ + d_x_data, \ + d_scale_data, \ + d_bias_data, \ + epsilon, \ + batch_size, \ + feature_size, \ + dev_ctx); \ + } while (0) + + if (scale_bias_dtype == x_dtype) { + PADDLE_LAUNCH_LAYERNORM_BWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_BWD(U, false); + } + +#undef PADDLE_LAUNCH_LAYERNORM_BWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d87b7c2193811cd6cf8138d1904c7fce01d3884a --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -0,0 +1,229 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_kernel.h" + +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" + +namespace phi { + +template +void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + std::vector input_shape, + const T *bias, + const T *scale, + T *output, + T *mean, + T *variance, + int begin_norm_axis, + float eps) { + const auto x_dims = phi::make_ddim(input_shape); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< + T, + T, + kBlockDim><<>>( + input, scale, bias, output, mean, variance, eps, feature_size)); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Product from begin_norm_axis to end in layer_norm must be larger " + "than 1")); + break; + } +} + +template class LayerNormDirectCUDAFunctor; + +template +void LayerNormKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale_opt, + paddle::optional bias_opt, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor *y, + DenseTensor *mean, + DenseTensor *var) { + using U = paddle::operators::LayerNormParamType; + auto *scale = scale_opt.get_ptr(); + auto *bias = bias_opt.get_ptr(); + + const auto x_dims = x.dims(); + auto *x_data = x.data(); + auto *y_data = dev_ctx.template Alloc(y); + auto *mean_data = dev_ctx.template Alloc(mean); + auto *var_data = dev_ctx.template Alloc(var); + + auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); + + auto x_dtype = x.dtype(); + phi::DataType scale_bias_dtype; + if (void_scale_data != nullptr) { + scale_bias_dtype = scale->dtype(); + if (void_bias_data != nullptr) { + PADDLE_ENFORCE_EQ( + scale->dtype(), + bias->dtype(), + phi::errors::InvalidArgument("Thie Scale and Bias of layer_norm op " + "should have the same data type.")); + } + } else { + scale_bias_dtype = (void_bias_data != nullptr ? bias->dtype() : x_dtype); + } + + bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; + if (!is_scale_bias_same_dtype_with_x) { + PADDLE_ENFORCE_EQ(scale_bias_dtype, + paddle::experimental::CppTypeToDataType::Type(), + phi::errors::InvalidArgument( + "Unsupported data type of Scale and Bias")); + } + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto stream = dev_ctx.stream(); + +#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { \ + FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< \ + T, \ + U, \ + kBlockDim, \ + IsScaleBiasSameDTypeWithX><<>>( \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + y_data, \ + mean_data, \ + var_data, \ + epsilon, \ + feature_size)); \ + default: \ + PADDLE_THROW(phi::errors::InvalidArgument( \ + "Product from begin_norm_axis to end must be larger than 1")); \ + break; \ + } \ + } while (0) + +#ifdef PADDLE_WITH_CUDA + bool can_call_1024_kernel = false; + if (feature_size == 1024 && scale != nullptr && bias != nullptr) { + can_call_1024_kernel = true; + } + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + const int grid = static_cast( + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); + if (is_scale_bias_same_dtype_with_x) { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + T, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } else { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + U, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } + } else { +#endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + +#undef PADDLE_LAUNCH_LAYERNORM_FWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..3e4cd21a658f103aca9bc611a2d42518245e4401 --- /dev/null +++ b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + lgamma_grad, GPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/gpu/lgamma_kernel.cu similarity index 54% rename from paddle/phi/kernels/cpu/reduce_max_kernel.cc rename to paddle/phi/kernels/gpu/lgamma_kernel.cu index f9ea0aa0faf06918253f9037282b924199e3a314..e94d67f4ce324ad9d8237a377d70a920cdbd30af 100644 --- a/paddle/phi/kernels/cpu/reduce_max_kernel.cc +++ b/paddle/phi/kernels/gpu/lgamma_kernel.cu @@ -12,28 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/lgamma_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" namespace phi { - +template +struct CudaLgammaFunctor { + __device__ __forceinline__ T operator()(const T x) const { + return Eigen::numext::lgamma(x); + } +}; template -void MaxRawKernel(const Context& dev_ctx, +void LgammaKernel(const Context& dev_ctx, const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); + // XKTODO( add gpu kernel implementation. ) + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + auto functor = CudaLgammaFunctor(); + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); } - } // namespace phi -PD_REGISTER_KERNEL( - max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(lgamma, GPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7b282536558db524c082de11c7ca92b2bd98edc --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context &dev_ctx, + const DenseTensor &out, + const DenseTensor &out_grad, + int axis, + DenseTensor *x_grad) { + dev_ctx.template Alloc(x_grad); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx, out, out_grad, axis, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d7e34c6c14e7a49f50c016d888f6fb875dca0776 --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + DenseTensor *out) { + dev_ctx.template Alloc(out); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index fc4adca2f42438f464346ad83bc7e49448826bb2..b443ae6b8fb5e6c3bf5264a50d25205a419f22ad 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -19,34 +19,27 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/masked_select_kernel.h" namespace phi { -__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) - mask_array[idx] = 1; - else - mask_array[idx] = 0; - } -} +template +struct MaskedSelectFunctor { + HOSTDEVICE MaskedSelectFunctor() {} -template -__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum, - const bool* mask, - const T* input, - T* out, - int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) { - int index = mask_prefix_sum[idx]; - out[index] = input[idx]; + HOSTDEVICE inline void operator()(OutT* out, + const MT* mask, + const InT* value, + int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + out[store_fix++] = value[idx]; + } } } -} +}; template void MaskedSelectKernel(const Context& dev_ctx, @@ -68,42 +61,9 @@ void MaskedSelectKernel(const Context& dev_ctx, "value.", input_dim, mask_dim)); - - thrust::device_ptr mask_dev_ptr = - thrust::device_pointer_cast(mask_data); - thrust::device_vector mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size); - auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true); - - DDim out_dim{out_size}; - out->Resize(out_dim); - auto out_data = out->mutable_data(dev_ctx.GetPlace()); - - DenseTensor mask_array; - DenseTensor mask_prefix_sum; - mask_array.Resize(mask_dim); - mask_prefix_sum.Resize(mask_dim); - - int32_t* mask_array_data = - mask_array.mutable_data(dev_ctx.GetPlace()); - int32_t* mask_prefix_sum_data = - mask_prefix_sum.mutable_data(dev_ctx.GetPlace()); - int threads = 512; - int grid = (mask_size + threads - 1) / threads; - auto stream = dev_ctx.stream(); - SetMaskArray<<>>( - mask_data, mask_array_data, mask_size); - - thrust::device_ptr mask_array_dev_ptr = - thrust::device_pointer_cast(mask_array_data); - thrust::device_vector mask_array_vec(mask_array_dev_ptr, - mask_array_dev_ptr + mask_size); - thrust::exclusive_scan(thrust::device, - mask_array_vec.begin(), - mask_array_vec.end(), - mask_prefix_sum_data); - - SelectWithPrefixMask<<>>( - mask_prefix_sum_data, mask_data, input_data, out_data, mask_size); + using Functor = MaskedSelectFunctor; + phi::funcs::SelectKernel( + dev_ctx, mask, x, out, Functor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu deleted file mode 100644 index af9d5574aa9feaf4d44482bbf0e75f31a5139595..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/math_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - dev_ctx.template Alloc(out); \ - funcs::BroadcastKernel( \ - dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ - } - -/** - * Kernels - */ - -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) - -} // namespace phi - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(add_raw, - GPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - GPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(divide_raw, - GPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - GPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128, - bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - GPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - float16, - bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PD_REGISTER_KERNEL(mean_raw, - GPU, - ALL_LAYOUT, - phi::MeanRawKernel, - float, - double, - bool, - float16, - int, - int64_t) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 2009547fc8d6f18c488faab5fd57cc985990229b..66ba30f7ce6945693a974733c77a47f0d328e50b 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -23,12 +23,12 @@ #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..43502621c2d3a878a144de1878aa09b8d64b6a47 --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +__global__ void AssignGradWithAxis(const T* grad_out, + const int64_t* indices, + T* grad_in, + int pre, + int post, + int raw_height, + int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + int base_index = i * post * k; + int base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + __syncthreads(); + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + int64_t idx_ij = indices[base_index + j]; + int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[base_index + j]; + } + } +} + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + + if (axis < 0) axis += in_dims.size(); + // allocate the cuda memory for the x_grad + T* x_grad_data = dev_ctx.template Alloc(x_grad); + const T* out_grad_data = out_grad.data(); + const int64_t* indices_data = indices.data(); + + int pre, n, post; + funcs::GetDims(in_dims, axis, &pre, &n, &post); + + // calcluate the block and grid num + int block_size = funcs::ComputeBlockSize(post); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + GPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..629b9722cd6bcfe12d0fb5a7e8be6439f5ea286f --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + // get the input dims + const auto& in_dims = x.dims(); + // calcluate the real axis + if (axis < 0) axis += in_dims.size(); + + auto out_dims = out->dims(); + + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetModebySort( + dev_ctx, &x, input_width, input_height, output_data, indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + for (int i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + // second step, tranpose the input + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + + int ndims = trans_axis.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + DenseTensor trans_ind; + trans_ind.Resize(trans_out_shape); + int64_t* trans_ind_data = dev_ctx.template Alloc(&trans_ind); + + DenseTensor trans_out; + trans_out.Resize(trans_out_shape); + T* trans_out_data = dev_ctx.template Alloc(&trans_out); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + funcs::GetModebySort(dev_ctx, + &trans_input, + input_width, + input_height, + trans_out_data, + trans_ind_data); + // last step, tranpose back the indices and output + funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans_axis); + funcs::TransCompute(ndims, dev_ctx, trans_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..21576ab608d269340322782c8113c6054c791e74 --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T), + stream); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + GPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/multiplex_kernel.cu b/paddle/phi/kernels/gpu/multiplex_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..743448a46866687cf2ac68be522a306281289252 --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T), + stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + GPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..76ee9439a2050b000b5cffd1df47581141a874c7 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_funcs.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +#define CUDA_NUM_THREADS 1024 + +inline static int PADDLE_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelFirstWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t plane_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluChannelLastWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t channel_index = index % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, + const T *alpha, + T *output, + size_t spatial_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t element_index = index % spatial_size; + T scale = alpha[element_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, + const T *alpha, + T *output, + size_t numel) { + T scale = alpha[0]; + CUDA_KERNEL_LOOP(index, numel) { + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel); +}; + +template +void PreluChannelWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel) { + if (channel_last) { + PReluChannelLastWiseKernel<<>>( + input, alpha, output, channel, numel); + } else { + PReluChannelFirstWiseKernel<<>>( + input, alpha, output, channel, numel / batch_size / channel, numel); + } +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel) { + PReluElementWiseKernel<<>>( + input, alpha, output, numel / batch_size, numel); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel) { + PReluScalarKernel<<>>( + input, alpha, output, numel); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d8661268e82c35f48d9877120574628c4325ae4e --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +enum PRELU_MODE { Element, ChannelFirst, ChannelLast, PRELU_Scalar }; + +template +__global__ void PReluOpGradKernel(const T* x_ptr, + const T* alpha_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + T* alpha_grad_ptr, + size_t channel_num, + size_t plane_size, + size_t spatial_size, + size_t numel, + PRELU_MODE mode) { + CUDA_KERNEL_LOOP(index, numel) { + T scale; + if (mode == Element) { + size_t element_index = index % spatial_size; + scale = alpha_ptr[element_index]; + } else if (mode == ChannelFirst) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + scale = alpha_ptr[channel_index]; + } else if (mode == ChannelLast) { + size_t channel_index = index % channel_num; + scale = alpha_ptr[channel_index]; + } else { + scale = alpha_ptr[0]; + } + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + if (x_grad_ptr != nullptr) + x_grad_ptr[index] = (x > zero) ? out_grad : scale * out_grad; + if (alpha_grad_ptr != nullptr) + alpha_grad_ptr[index] = (x > zero) ? zero : x * out_grad; + } +} + +template +class PreluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* alpha, + const T* out_grad, + T* x_grad, + T* alpha_grad, + const DDim& input_dims, + PRELU_MODE mode) { + size_t numel = 1; + for (size_t i = 0; i < input_dims.size(); ++i) { + numel *= input_dims[i]; + } + size_t plane_size = numel / input_dims[0] / input_dims[1]; + size_t spatial_size = numel / input_dims[0]; + size_t channel = + mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1]; + + PReluOpGradKernel< + T><<>>( + x, + alpha, + out_grad, + x_grad, + alpha_grad, + channel, + plane_size, + spatial_size, + numel, + mode); + } +}; + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad ? dev_ctx.template Alloc(x_grad) : nullptr; + T* alpha_grad_ptr = + alpha_grad ? dev_ctx.template Alloc(alpha_grad) : nullptr; + + if (!x_grad && !alpha_grad) return; + + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + std::vector input_shape = phi::vectorize(dim); + auto stream = dev_ctx.stream(); + + T* alpha_grad_tmp_ptr; + DenseTensor alpha_grad_tmp; + if (alpha_grad_ptr == nullptr) { + alpha_grad_tmp_ptr = alpha_grad_ptr; + } else { + DenseTensorMeta alpha_grad_meta( + alpha_grad->dtype(), dim, alpha_grad->layout()); + alpha_grad_tmp = phi::Empty(dev_ctx, std::move(alpha_grad_meta)); + alpha_grad_tmp_ptr = alpha_grad_tmp.data(); + } + + PRELU_MODE m; + bool channel_last = false; + if (mode == "element") { + m = Element; + } else if (mode == "channel") { + channel_last = data_format == "NHWC"; + m = channel_last ? ChannelLast : ChannelFirst; + } else { + m = PRELU_Scalar; + } + PreluOpGradFunctor prelu_grad; + prelu_grad(stream, + x_ptr, + alpha_ptr, + out_grad_ptr, + x_grad_ptr, + alpha_grad_tmp_ptr, + dim, + m); + + if (alpha_grad_tmp_ptr == nullptr) return; + + std::vector reduce_dims; + for (size_t i = 0; i < dim.size(); i++) { + if (mode == "channel" && !channel_last && i == 1) continue; + if (mode == "channel" && channel_last && i == dim.size() - 1) continue; + if (mode == "element" && i != 0) continue; + reduce_dims.push_back(i); + } + + phi::funcs::ReduceKernel>( + static_cast(dev_ctx), + alpha_grad_tmp, + alpha_grad, + kps::IdentityFunctor(), + reduce_dims); +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu_grad, + GPU, + ALL_LAYOUT, + phi::PReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8255a7ba2ed96dcdeb8d6e23a4637ce56d636a12 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + const T* alpha_ptr = alpha.data(); + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + + VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" + << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; + + if (mode == "channel") { + bool channel_last = data_format == "NHWC"; + size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; + PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(dev_ctx.stream(), + x_ptr, + alpha_ptr, + o_ptr, + dim[0], + channel, + channel_last, + numel); + } else if (mode == "element") { + PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise( + dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel); + } else { + PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, + GPU, + ALL_LAYOUT, + phi::PReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index d4d90cac917a2c35e26eca0d57d1c5349b878599..92948bf47c9345e53d1fea54d2be4fd8efbc6f96 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -14,37 +14,161 @@ #include "paddle/phi/kernels/randperm_kernel.h" +#ifdef __NVCC__ +#include +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +#include +namespace cub = hipcub; +#endif +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/randint_kernel.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" +DECLARE_bool(use_curand); + namespace phi { +template +__global__ void SwapRepeatKernel( + int* key, T* data, int n, uint64_t seed, uint64_t offset) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < n) return; + + bool first_repeat = false; + if (data[idx] == data[idx + 1]) { + if (idx == 0) { + first_repeat = true; + } else if (data[idx] != data[idx - 1]) { + first_repeat = true; + } + } + + if (!first_repeat) return; + + int repeat_size = 1; + for (int i = idx; i < n; ++i) { + if (data[i] == data[i + 1]) { + ++repeat_size; + } else { + break; + } + } + +#ifdef __NVCC__ + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = curand(&state) % (i + 1); +#elif __HIPCC__ + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = hiprand(&state) % (i + 1); +#endif + if (r != i) { + T tmp = data[idx + i]; + data[idx + i] = data[idx + r]; + data[idx + r] = tmp; + } + } +} + template void RandpermRawKernel( const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { - DenseTensor tmp; - tmp.Resize(phi::make_ddim({n})); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); + if (FLAGS_use_curand) { + DenseTensor key; + RandintKernel(dev_ctx, + std::numeric_limits::min(), + std::numeric_limits::max(), + ScalarArray({n}), + phi::DataType::INT32, + &key); + DenseTensor key_out = Empty(dev_ctx, ScalarArray({n})); + + DenseTensor range = Empty(dev_ctx, ScalarArray({n})); + T* range_data = range.data(); + funcs::ForRange for_range(dev_ctx, n); + for_range([range_data] __device__(size_t idx) { + range_data[idx] = static_cast(idx); + }); + + out->Resize(phi::make_ddim({n})); + T* out_data = dev_ctx.template Alloc(out); + + // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to + // improve performance of radix sort. + double n_d = static_cast(n); + int begin_bit = 0; + int end_bit = + std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto gen_cuda = dev_ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(n); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + SwapRepeatKernel<<>>( + key_out.data(), out_data, n, seed, offset); } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } + DenseTensor tmp; + tmp.Resize(phi::make_ddim({n})); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - for (int i = 0; i < n; ++i) { - tmp_data[i] = static_cast(i); - } - std::shuffle(tmp_data, tmp_data + n, *engine); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } - T* out_data = dev_ctx.template Alloc(out); - auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); - paddle::memory::Copy( - out->place(), out_data, tmp.place(), tmp_data, size, 0); + for (int i = 0; i < n; ++i) { + tmp_data[i] = static_cast(i); + } + std::shuffle(tmp_data, tmp_data + n, *engine); + + T* out_data = dev_ctx.template Alloc(out); + auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); + paddle::memory::Copy( + out->place(), out_data, tmp.place(), tmp_data, size, 0); + } } template diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index d21c8a3fa46f81c046c722db50ac62fb57cf64f4..e32101b73728f637da0626d691018842aedd62e7 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -43,5 +43,59 @@ void ReduceGrad(const GPUContext& dev_ctx, })); } +template class TransformOp> +void ReduceGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + auto* in_x = &x; + auto* d_out = &out_grad; + auto* d_x = x_grad; + + auto pt_out_dtype = in_dtype; + + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = + funcs::details::GetReduceDim(dims, dim_size, reduce_all); + + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + // make new tensor + DenseTensor new_d_out(d_out->dtype()); + new_d_out.ShareDataWith(*d_out); + new_d_out.Resize(phi::make_ddim(update_dims)); + if (in_dtype != DataType::UNDEFINED) { + dev_ctx.Alloc(d_x, in_dtype); + } else { + dev_ctx.Alloc(d_x, d_out->dtype()); + } + + auto pt_d_out = new_d_out; + auto pt_d_x = *d_x; + if (in_dtype == DataType::UNDEFINED) { + pt_out_dtype = d_out->dtype(); + } + using MPType = typename kps::details::MPTypeTrait::Type; + + phi::ReduceGrad>( + dev_ctx, + &pt_d_out, + &pt_d_x, + pt_out_dtype, + TransformOp(reduce_num)); +} + } // namespace phi #endif diff --git a/paddle/phi/kernels/gpu/reduce_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5256048267ea19a4cb12387ebbc582a2df1bd1b1 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" + +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(sum_grad, + GPU, + ALL_LAYOUT, + phi::ReduceSumGradKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(prod_grad, + GPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6cbe699e8e05831b049536b06b1fdadcc145537d --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -0,0 +1,158 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + GPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + float16, + bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL(mean_raw, + GPU, + ALL_LAYOUT, + phi::MeanRawKernel, + float, + double, + bool, + float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(prod_raw, + GPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} + +PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu index 98c3986c51dd6829287f5316ae9eb52f328372ab..ddbc08b06c84b0afe42091ddf9a53a928621ef6d 100644 --- a/paddle/phi/kernels/gpu/reduce_max_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu deleted file mode 100644 index 9f4ddc3cf37a744355f6f79b7cd18b3d06b80062..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/gpu/reduce_grad.h" - -namespace phi { - -template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType in_dtype, - DataType out_dtype, - DenseTensor* x_grad) { - auto* in_x = &x; - auto* d_out = &out_grad; - auto* d_x = x_grad; - - auto pt_out_dtype = in_dtype; - - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = - funcs::details::GetReduceDim(dims, dim_size, reduce_all); - - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - DenseTensor new_d_out(d_out->dtype()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(phi::make_ddim(update_dims)); - if (in_dtype != DataType::UNDEFINED) { - dev_ctx.Alloc(d_x, in_dtype); - } else { - dev_ctx.Alloc(d_x, d_out->dtype()); - } - - auto pt_d_out = new_d_out; - auto pt_d_x = *d_x; - if (in_dtype == DataType::UNDEFINED) { - pt_out_dtype = d_out->dtype(); - } - using MPType = typename kps::details::MPTypeTrait::Type; - - phi::ReduceGrad>( - dev_ctx, - &pt_d_out, - &pt_d_x, - pt_out_dtype, - kps::IdentityFunctor(reduce_num)); -} - -} // namespace phi - -PD_REGISTER_KERNEL(sum_grad, - GPU, - ALL_LAYOUT, - phi::ReduceSumGradKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf076128b69396196f59a8accd0c282322f8f49a --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__device__ void BilinearInterpolateGradient(const int height, + const int width, + T y, + T x, + T* w1, + T* w2, + T* w3, + T* w4, + int* x_low, + int* x_high, + int* y_low, + int* y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return; + } + + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + *y_low = static_cast(y); + *x_low = static_cast(x); + if (*y_low >= height - 1) { + *y_high = *y_low = height - 1; + y = static_cast(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = *x_low = width - 1; + x = static_cast(*x_low); + } else { + *x_high = *x_low + 1; + } + T ly = y - *y_low, lx = x - *x_low; + T hy = 1. - ly, hx = 1. - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + + return; +} + +template +__global__ void GPURoiAlignBackward(const int nthreads, + const T* input_rois, + const T* out_grad, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, + T* input_grad, + const bool continuous_coordinate) { + CUDA_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_offset = continuous_coordinate ? T(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_input_grad = + input_grad + (roi_batch_ind * channels + c) * height * width; + + const T* offset_out_grad = + out_grad + (n * channels + c) * pooled_height * pooled_width; + const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + const T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T w1 = 0, w2 = 0, w3 = 0, w4 = 0; + int x_low = -1, x_high = -1, y_low = -1, y_high = -1; + BilinearInterpolateGradient(height, + width, + y, + x, + &w1, + &w2, + &w3, + &w4, + &x_low, + &x_high, + &y_low, + &y_high); + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_low, diff1); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_high, diff2); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_low, diff3); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_high, diff4); + } + } + } + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + int rois_num = boxes.dims()[0]; + int channels = x.dims()[1]; + int height = x.dims()[2]; + int width = x.dims()[3]; + + if (!dx) { + return; + } + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_size = dev_ctx.template HostAlloc(&box_batch_id_list); + + auto cplace = phi::CPUPlace(); + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(cplace, + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_size[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_size[i] = n; + } + } + } + auto roi_ptr = + paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = box_batch_id_list.numel() * sizeof(int); + paddle::memory::Copy( + gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream()); + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiAlignBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + roi_id_data, + dx->data(), + aligned); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index 2f906fa4f663b6da65a3e986af2214dfb49f2ec0..cb3375dee95a5992fd598fdc8ba4f5e176f357a2 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -18,7 +18,6 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/fluid/memory/memory.h" @@ -71,7 +70,7 @@ __device__ T BilinearInterpolate( } template -__global__ void GPUROIAlignForward(const int nthreads, +__global__ void GPURoiAlignForward(const int nthreads, const T* input_data, const T* input_rois, const float spatial_scale, @@ -137,7 +136,7 @@ __global__ void GPUROIAlignForward(const int nthreads, } template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, @@ -233,7 +232,7 @@ void ROIAlignKernel(const Context& dev_ctx, int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); paddle::memory::Copy( gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); - GPUROIAlignForward<<>>( + GPURoiAlignForward<<>>( output_size, x.data(), boxes.data(), @@ -252,4 +251,4 @@ void ROIAlignKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {} + roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d093a71d23f4ea96f9d7e7de11dcfefade3788ee --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolBackward(const int nthreads, + const T* input_rois, + const T* output_grad, + const int64_t* arg_max_data, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + int roi_batch_ind = box_batch_id_data[n]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_arg_max_data = arg_max_data + output_offset; + + int arg_max = offset_arg_max_data[ph * pooled_width + pw]; + if (arg_max != -1) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + arg_max, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + auto x_dims = x.dims(); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + if (dx) { + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = + dev_ctx.template HostAlloc(&box_batch_id_list); + + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + int bytes = box_batch_id_list.numel() * sizeof(int); + auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + paddle::memory::Copy(gplace, + roi_id_data, + phi::CPUPlace(), + box_batch_id_data, + bytes, + dev_ctx.stream()); + + dev_ctx.template Alloc(dx); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiPoolBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + arg_max.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + roi_id_data, + dx->data()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool_grad, GPU, ALL_LAYOUT, phi::RoiPoolGradKernel, float, double) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ab33e2cf64751f1cd5be44fc6f759acffd2fb93d --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -0,0 +1,220 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* output_data, + int64_t* arg_max_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int box_batch_ind = box_batch_id_data[n]; + int box_start_w = round(offset_input_rois[0] * spatial_scale); + int box_start_h = round(offset_input_rois[1] * spatial_scale); + int box_end_w = round(offset_input_rois[2] * spatial_scale); + int box_end_h = round(offset_input_rois[3] * spatial_scale); + + int box_width = max(box_end_w - box_start_w + 1, 1); + int box_height = max(box_end_h - box_start_h + 1, 1); + + int hstart = static_cast(floor(static_cast(ph) * + static_cast(box_height) / + static_cast(pooled_height))); + int wstart = static_cast(floor(static_cast(pw) * + static_cast(box_width) / + static_cast(pooled_width))); + int hend = static_cast(ceil(static_cast(ph + 1) * + static_cast(box_height) / + static_cast(pooled_height))); + int wend = static_cast(ceil(static_cast(pw + 1) * + static_cast(box_width) / + static_cast(pooled_width))); + hstart = min(max(hstart + box_start_h, 0), height); + hend = min(max(hend + box_start_h, 0), height); + wstart = min(max(wstart + box_start_w, 0), width); + wend = min(max(wend + box_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (box_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[i] = maxval; + if (arg_max_data) { + arg_max_data[i] = maxidx; + } + } +} + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + auto in_stride = phi::stride(x_dims); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = dev_ctx.template HostAlloc(&box_batch_id_list); + auto gplace = dev_ctx.GetPlace(); + + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, + boxes_num_with_lod, + phi::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + int bytes = box_batch_id_list.numel() * sizeof(int); + auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* box_id_data = reinterpret_cast(box_ptr->ptr()); + paddle::memory::Copy(gplace, + box_id_data, + phi::CPUPlace(), + box_batch_id_data, + bytes, + dev_ctx.stream()); + + T* output_data = dev_ctx.template Alloc(out); + int64_t* arg_max_data = dev_ctx.template Alloc(arg_max); + + GPURoiPoolForward<<>>( + output_size, + x.data(), + boxes.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + box_id_data, + output_data, + arg_max_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool, GPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double) { + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..93e9e81882c9e6eacd5f9ee91fa7541495ef2663 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + auto* in_data = out_grad.data(); + T* out_data = dev_ctx.template Alloc(x_grad); + int64_t numel = out_grad.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + auto input_dim = out_grad.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + if (size != 0) { + shifts_data[i] = ((-shifts_data[i]) % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + GPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1543335d3a0c5884d6b82394253bb4e8dda8cef0 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + auto* in_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); + int64_t numel = x.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + + size_t nums = shifts_data.size(); + auto input_dim = x.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = (shifts_data[0] % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + + if (size != 0) { + shifts_data[i] = (shifts_data[i] % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + GPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..abe3ee470b4bc6b3951e1ad2da09544e319cbcac --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void RollCudaKernel(const T* input, + T* output, + int64_t N, + phi::Array shifts, + phi::Array strides, + phi::Array sizes) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t output_idx = idx; + int64_t new_dim_idx = 0; + +#pragma unroll + for (size_t i = 0; i < Rank; i++) { + new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; + if (new_dim_idx >= sizes[i]) { + output_idx += (shifts[i] - sizes[i]) * strides[i]; + } else { + output_idx += shifts[i] * strides[i]; + } + } + output[output_idx] = input[idx]; +} + +#define CALL_ROLL_CUDA_KERNEL(N) \ + case N: { \ + phi::Array _strides; \ + phi::Array _shifts; \ + phi::Array _sizes; \ + for (size_t idx = 0; idx < N; ++idx) { \ + _strides[idx] = strides[idx]; \ + _shifts[idx] = shifts_data[idx]; \ + _sizes[idx] = sizes[idx]; \ + } \ + RollCudaKernel< \ + T, \ + N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ + PADDLE_CUDA_NUM_THREADS, \ + 0, \ + stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \ + break; \ + } + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu index d9618dc159a6d3f5b24bdfcfdb219ec649e051f9..9d1769e18b4b809fbc353513a05553e0ccd97572 100644 --- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu @@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu index c38e935adf837ef00c48fa31bc1e37eea2948673..3128e534166acba6ca136331ad8efea66b18621f 100644 --- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu @@ -19,5 +19,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + GPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu similarity index 53% rename from paddle/phi/kernels/gpu/reduce_prod_kernel.cu rename to paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu index 278d4a6e5ab79a7519e1052a2d05c6ecda62692f..bc3ef1bc623bb27ac2452d1e908c389543598011 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu @@ -12,32 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" -namespace phi { - -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(reduce_prod, +PD_REGISTER_KERNEL(tril_triu_grad, GPU, ALL_LAYOUT, - phi::ReduceProdKernel, + phi::TrilTriuGradKernel, + bool, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c48edf9eff25aa68abcfe0b08dd7ab659aaa0fb --- /dev/null +++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu, + GPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index bb04e7ee8515bb6320860e4fd20366995d26c991..f27b32ca7b8319440b62f0d03d21129133c8470c 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -33,27 +33,23 @@ struct GPUTruncatedNormal { T mean, std; T a_normal_cdf; T b_normal_cdf; - unsigned int seed; T numeric_min; __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf((-2.0 - mean) / std); - b_normal_cdf = normal_cdf((2.0 - mean) / std); + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + thrust::uniform_real_distribution dist(numeric_min, 1); rng.discard(n); T value = dist(rng); - return std::sqrt(2.0) * erfinvf(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; } }; @@ -73,21 +69,18 @@ struct TruncatedNormalOffset { seed(seed), numeric_min(numeric_min), offset_(offset) { - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf((-2.0 - mean) / std); - b_normal_cdf = normal_cdf((2.0 - mean) / std); + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + thrust::uniform_real_distribution dist(numeric_min, 1); rng.discard(n + offset_); T value = dist(rng); - return std::sqrt(2.0) * erfinvf(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; } }; diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu index 535cb812a20ea90bdb3f07b731af52c2822f0ec2..9538533f70d597e21b393d2650d56bebd823c360 100644 --- a/paddle/phi/kernels/gpu/where_index_kernel.cu +++ b/paddle/phi/kernels/gpu/where_index_kernel.cu @@ -20,150 +20,59 @@ namespace cub = hipcub; #endif -#include "paddle/phi/kernels/where_index_kernel.h" - -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" +#include "paddle/phi/kernels/where_index_kernel.h" namespace phi { - -template -__global__ void GetTrueNum(const T *cond_data, - const int64_t numel, - int64_t *true_num_array) { - const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { - true_num_array[idx] = - static_cast(static_cast(cond_data[idx])); +template +struct IndexFunctor { + T2 stride[phi::DDim::kMaxRank]; + int dims; + explicit IndexFunctor(const phi::DDim &in_dims) { + dims = in_dims.size(); + std::vector strides_in_tmp; + strides_in_tmp.resize(dims, 1); + // get strides according to in_dims + for (T2 i = 1; i < dims; i++) { + strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[dims - i]; + } + memcpy(stride, strides_in_tmp.data(), dims * sizeof(T2)); } -} - -template -__global__ void SetTrueIndex(int64_t *out_ptr, - const T *cond_data, - const int64_t numel, - const int64_t *stride_array, - const int64_t rank, - const int64_t *true_num_array) { - const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { - // true_num_array is calculated by cub::InclusiveSum, - // cause the first element of true_num_array is 1, - // so we need substract 1 to get true index. - const int64_t true_index = true_num_array[idx] - 1; - if (static_cast(cond_data[idx])) { - int64_t rank_index = idx; - for (int j = 0; j < rank; j++) { - const int64_t out_index = rank_index / stride_array[j]; - out_ptr[true_index * rank + j] = out_index; - rank_index -= out_index * stride_array[j]; + HOSTDEVICE inline void operator()(OutT *out, + const T1 *mask, + const T2 *index, + const int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + T2 data_index = index[idx]; + // get index + for (int rank_id = dims - 1; rank_id >= 0; --rank_id) { + out[store_fix] = static_cast(data_index / stride[rank_id]); + data_index = data_index % stride[rank_id]; + store_fix++; + } } } } -} +}; template void WhereIndexKernel(const Context &dev_ctx, const DenseTensor &condition, DenseTensor *out) { - const T *cond_data = condition.data(); - const int64_t numel = condition.numel(); + DenseTensor in_data; auto dims = condition.dims(); - const int rank = dims.size(); - - auto d_array_mem = - paddle::memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t)); - auto h_array_mem = - paddle::memory::Alloc(phi::CPUPlace(), (rank + 1) * sizeof(int64_t)); - - // "stride_array" is an array and len(stride_array)==rank, - // each element is the stride of each dimension -- the length from i to i+1. - int64_t *h_stride_array = reinterpret_cast(h_array_mem->ptr()); - int64_t *d_stride_array = reinterpret_cast(d_array_mem->ptr()); - - // "true_num_array" is an array and len(stride_array)==numel, - // at the beginning, - // "true_num_array" will set 1 if condition[i] == true else 0, - // then it will be calculated by cub::InclusiveSum, - // so that we can get the true number before i as the out index - int64_t *d_true_num_array = d_stride_array + rank; - - // the total_true_num is the total number of condition[i] == true - int64_t *h_total_true_num = h_stride_array + rank; - - // alloce cub memory - size_t cub_size = 0; - cub::DeviceScan::InclusiveSum(nullptr, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - auto cub_mem = paddle::memory::Alloc(dev_ctx, cub_size * sizeof(int64_t)); - void *cub_data = cub_mem->ptr(); - - // set d_true_num_array[i]=1 if cond_data[i]==true else 0 - const int threads = std::min(numel, static_cast(128)); - const int64_t need_grids = (numel + threads - 1) / threads; - const int grids = std::min(need_grids, static_cast(256)); - GetTrueNum<<>>( - cond_data, numel, d_true_num_array); - - // calculate the inclusive prefix sum of "true_num_array" - // to get the index of "out" tensor, - // and the total number of cond_data[i]==true. - // Example: - // condition: F T T F F F T T - // before: 0 1 1 0 0 0 1 1 - // after: 0 1 2 2 2 2 3 4 - // out: 1 2 6 7 - cub::DeviceScan::InclusiveSum(cub_data, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - - // calculate each dimension's stride - h_stride_array[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; - } - paddle::memory::Copy(dev_ctx.GetPlace(), - d_stride_array, - phi::CPUPlace(), - h_stride_array, - rank * sizeof(int64_t), - dev_ctx.stream()); - - // get total ture number and set output size - // the last element of cub::InclusiveSum is the total number - paddle::memory::Copy(phi::CPUPlace(), - h_total_true_num, - dev_ctx.GetPlace(), - d_true_num_array + numel - 1, - sizeof(int64_t), - dev_ctx.stream()); - dev_ctx.Wait(); - - int64_t true_num = *h_total_true_num; - out->Resize(phi::make_ddim({static_cast(true_num), rank})); - auto *out_data = dev_ctx.template Alloc(out); - - if (true_num == 0) { - return; - } - - // using true_num_array and stride_array to calculate the output index - SetTrueIndex<<>>( - out_data, cond_data, numel, d_stride_array, rank, d_true_num_array); + using Functor = IndexFunctor; + Functor index_functor = Functor(dims); + phi::funcs::SelectKernel( + dev_ctx, condition, in_data, out, index_functor); } - } // namespace phi PD_REGISTER_KERNEL(where_index, diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 2b2dd5118969cf35c4762f3ab774ce41c04d2e4d..77159bfc876da603f703a13592f525d808adfbbf 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -121,17 +121,10 @@ struct ReduceMaxFunctor { }; template -struct ExpSubFunctor { - HOSTDEVICE inline ExpSubFunctor() { y = static_cast(0.0f); } - - HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {} - +struct ExpFunctor { HOSTDEVICE inline Ty operator()(const Tx& x) const { - return static_cast(std::exp(x - y)); + return static_cast(std::exp(x)); } - - private: - Tx y; }; template @@ -293,10 +286,14 @@ __global__ void WarpSoftmaxForward(T* softmax, } // data src - AccT srcdata[kBatchSize][kLoopsV][kVSize]; - T src_tmp[kBatchSize][kLoopsV][kVSize]; - kps::Init(&srcdata[0][0][0], kLowInf); - kps::Init(&src_tmp[0][0][0], -std::numeric_limits::infinity()); + // src_data: the raw data form global memory + // sub_data: store the data obtained by (src_data - max), used by log_softmax + // exp_data: store the data obtained by (exp(sub_data)), used by softmax + T src_data[kBatchSize][kLoopsV][kVSize]; + AccT sub_data[kBatchSize][kLoopsV][kVSize]; + AccT exp_data[kBatchSize][kLoopsV][kVSize]; + kps::Init(&sub_data[0][0][0], kLowInf); + kps::Init(&src_data[0][0][0], -std::numeric_limits::infinity()); // data dst T out_tmp[kBatchSize][kLoopsV][kVSize]; @@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax, for (int i = 0; i < kBatchSize; ++i) { const VecT* src_v = reinterpret_cast(&src[(first_batch + i) * stride]); - VecT* reg_v = reinterpret_cast(&src_tmp[i][0][0]); + VecT* reg_v = reinterpret_cast(&src_data[i][0][0]); kps::ReadData( ®_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1); kps::ElementwiseUnary>( - &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor()); + &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor()); } // compute max @@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax, 1, ReduceMaxFunctor, kMode::kLocalMode>( - &max[0], &srcdata[0][0][0], ReduceMaxFunctor(), true); + &max[0], &sub_data[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum #pragma unroll for (int i = 0; i < kBatchSize; ++i) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); + kps::ElementwiseUnary>( + &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor(max[i])); + kps::ElementwiseUnary>( + &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor()); } kps::Reduce, kMode::kLocalMode>( - &sum[0], &srcdata[0][0][0], kps::AddFunctor(), true); + &sum[0], &exp_data[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); // write data to global memory @@ -352,15 +351,13 @@ __global__ void WarpSoftmaxForward(T* softmax, reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); if (LogMode) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor()); kps::ElementwiseUnary>( &out_tmp[i][0][0], - &srcdata[i][0][0], + &sub_data[i][0][0], UnarySubFunctor(std::log(sum[i]))); } else { kps::ElementwiseUnary>( - &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor(sum[i])); } kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/grid_sample_grad_kernel.h similarity index 51% rename from paddle/phi/kernels/reduce_max_kernel.cc rename to paddle/phi/kernels/grid_sample_grad_kernel.h index de172a12d72884fb018acbb42c077efc825508ce..50a8d5be260bd387476467e2cdddaeb59f943b9b 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/grid_sample_grad_kernel.h @@ -12,28 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_max_kernel.h" +#pragma once -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" +#include + +#include "paddle/phi/core/dense_tensor.h" namespace phi { template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} +void GridSampleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const DenseTensor &out_grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *x_grad, + DenseTensor *grid_grad); } // namespace phi - -PD_REGISTER_KERNEL( - max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#endif diff --git a/paddle/phi/kernels/grid_sample_kernel.h b/paddle/phi/kernels/grid_sample_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2e1e9b508649b22de086a103537f4984b7f693e5 --- /dev/null +++ b/paddle/phi/kernels/grid_sample_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GridSampleKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f7a327cd3f566d7d3e3da9517ba2f50d67b6ba60 --- /dev/null +++ b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..619b022904b17b3669abe61dc5ce341f6c6ae9bc --- /dev/null +++ b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HierarchicalSigmoidKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index a48a6226f23f8d9976dc86e59b051828b1d71b21..7d6b6dc72ea60214ff4c9974b4ff885feecb5822 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -202,4 +202,77 @@ void TanhTripleGradKernel(const Context& dev_ctx, d_ddx); // output } +template +void EluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout) { + if (dx) { + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + } + funcs::ELUGradGradFunctor functor; + functor.alpha = alpha; + functor(dev_ctx, &x, &ddx, ddout, &dout, dx); +} + +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::SigmoidGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::SigmoidTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, + d_dout, + d_out_new, + d_ddx); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 9f557e746378939e32a32955e758cdc5c510f229..e3ea10705d24e90a76246d439c6d9263e072bc39 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,18 +19,17 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/transpose_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -115,7 +114,7 @@ void CholeskySolveGradKernel(const Context& dev_ctx, const auto H = y_bst_dims_vec[y_bst_ndim - 2]; const auto W = y_bst_dims_vec[y_bst_ndim - 1]; phi::funcs::ForRange y_for_range(dev_ctx, dy_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dy_bst.data(), 0, !upper, H, W, dy_bst_upper.data()); y_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d8795808a643d2741ca210b13303febd187a193a --- /dev/null +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -0,0 +1,173 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + T h, + T w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; + + T v1 = + (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; + T v2 = (h_low >= 0 && w_high <= width - 1) + ? bottom_data[h_low * data_width + w_high] + : 0; + T v3 = (h_high <= height - 1 && w_low >= 0) + ? bottom_data[h_high * data_width + w_low] + : 0; + T v4 = (h_high <= height - 1 && w_high <= width - 1) + ? bottom_data[h_high * data_width + w_high] + : 0; + + T w1 = hh * hw; + T w2 = hh * lw; + T w3 = lh * hw; + T w4 = lh * lw; + + return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col); + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const DenseTensor& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out) { + const int batch_size = static_cast(x.dims()[0]); + + std::vector filter_shape_vec(phi::vectorize(filter.dims())); + std::vector output_shape_vec(phi::vectorize(out->dims())); + + // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + + DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); + DenseTensor output_buffer = Empty(dev_ctx, output_buffer_shape_vec); + + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; + + DenseTensor weight_3d; + weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); + + DenseTensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer) + .Resize(phi::make_ddim({groups, K, N})); + + DenseTensor output_4d; + output_4d.ShareDataWith(output_buffer) + .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); + + DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); + std::vector input_shape_vec = phi::vectorize(input_shape); + + int input_dim = x.numel() / x.dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask.numel() / mask.dims()[0]; + + auto blas = phi::funcs::GetBlas(dev_ctx); + + const T* input_ptr = x.data(); + const T* offset_ptr = offset.data(); + const T* mask_ptr = mask.data(); + T* col_buffer_ptr = col_buffer.data(); + + for (int i = 0; i < batch_size / im2col_step; ++i) { + ModulatedDeformableIm2col(dev_ctx, + input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + col_buffer_ptr); + DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize( + phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + // get the product of pixel and weight + for (int g = 0; g < groups; ++g) { + DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); + DenseTensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); + blas.MatMul(weight_3d_slice, + false, + col_buffer_3d_slice, + false, + T(1.0), + &output_3d_slice, + T(0.0)); + } + } + out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..e4356e9af39372cd330991502078a13520d05586 --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -0,0 +1,159 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace detail { + +template +struct FoundZeroFunctor { + FoundZeroFunctor(const T* x, int64_t numel, bool* res) + : x_(x), numel_(numel), res_(res) {} + HOSTDEVICE void operator()(size_t idx) const { + if (*res_ || idx >= static_cast(numel_)) { + // founded zero number + return; + } + *res_ = (x_[idx] == static_cast(0)); + } + const T* x_; + int64_t numel_; + bool* res_; +}; + +template +inline bool CheckMatrixInvertible(const Context& dev_ctx, + const DenseTensor* det) { + auto numel = det->numel(); + + DenseTensor dev_tensor = phi::Empty(dev_ctx, {1}); + + // set false + phi::funcs::SetConstant zero; + zero(dev_ctx, &dev_tensor, false); + + // find whether zero + phi::funcs::ForRange for_range(dev_ctx, numel); + FoundZeroFunctor functor(det->data(), numel, dev_tensor.data()); + for_range(functor); + + // copy to host + DenseTensor cpu_tensor; + phi::Copy(dev_ctx, dev_tensor, phi::CPUPlace(), false, &cpu_tensor); + + // if founded zero, the matrix is not invertible + // else the matrix is invertible + auto* res = cpu_tensor.data(); + return !(*res); +} + +} // namespace detail + +template +void DeterminantGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + auto input_dims_size = x.dims().size(); + if (input_dims_size > 2) { + PADDLE_ENFORCE_EQ( + out_grad.dims().size() + 2, + input_dims_size, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else if (input_dims_size == 2) { + // input dims size 2 and grad dims size 1 is possible + PADDLE_ENFORCE_EQ( + out_grad.dims().size(), + 1, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else { + // checked in forward, pass + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (det(A)=0) + if (!detail::CheckMatrixInvertible(dev_ctx, &out)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + x_grad->Resize(x.dims()); + phi::Full( + dev_ctx, phi::vectorize(x.dims()), static_cast(0.0f), x_grad); + return; + } + + // The matrix is invertible + // let |A| = Determinant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, + // -1) + + // First: inverse(A) + DenseTensor inverse_A; + // A must be square matrices! + inverse_A.Resize(x.dims()); + dev_ctx.template Alloc(&inverse_A); + + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, x, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).transpose(-2, -1) + DenseTensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Third: dA * |A| + auto mul_dA_detA = phi::Multiply(dev_ctx, out_grad, out); + VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); + + // Fourth: unsqueeze(dA * |A|, [-1, -2]) + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dA * |A|) * inverse(A) + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); + + VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); + + x_grad->Resize(x.dims()); + VLOG(3) << "d|A| dims: " << x_grad->dims(); + + phi::Copy(dev_ctx, res, dev_ctx.GetPlace(), false, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f3a611b89c95c64184a953e5069c8b200317da46 --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -0,0 +1,124 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include +#include +#include +#include +#include + +#include "paddle/phi/core/enforce.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { +namespace detail { +template +class EigenMatrix {}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXf; +}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXd; +}; + +inline int64_t GetBatchCount(const DDim dims) { + int64_t batch_count = 1; + auto dim_size = dims.size(); + PADDLE_ENFORCE_GE( + dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + + // Cumulative multiplying each dimension until the last 2 to get the batch + // count, + // for example a tensor with shape [3,3,3,3], the batch count of matrices is + // 9. + for (int64_t i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + + return batch_count; +} +} // namespace detail + +template +struct DeterminantFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + int64_t rank, + int64_t batch_count, + DenseTensor* output) { + std::vector input_vec; + std::vector output_vec; + paddle::framework::TensorToVector(input, dev_ctx, &input_vec); + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * rank * rank; + auto end_iter = input_vec.begin() + (i + 1) * rank * rank; + std::vector sub_vec(begin_iter, + end_iter); // get every square matrix data + typename detail::EigenMatrix::MatrixType matrix(rank, rank); + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { + matrix(i, j) = sub_vec[rank * i + j]; + } + } + output_vec.push_back(matrix.determinant()); + } + paddle::framework::TensorFromVector(output_vec, output); + } +}; + +template +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto input_dim = vectorize(x.dims()); + auto input_dim_size = input_dim.size(); + + auto batch_count = detail::GetBatchCount(x.dims()); + VLOG(10) << "input dim:" << x.dims(); + PADDLE_ENFORCE_GE( + input_dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], + input_dim[input_dim_size - 2], + phi::errors::InvalidArgument( + "the input matrix should be square matrix.")); + auto rank = input_dim[input_dim_size - 1]; // square matrix length + DeterminantFunctor()(dev_ctx, x, rank, batch_count, out); + auto output_dims = phi::slice_ddim(x.dims(), 0, input_dim_size - 2); + if (input_dim_size > 2) { + out->Resize(output_dims); + } else { + // when input is a two-dimension matrix, The det value is a number. + out->Resize({1}); + } + VLOG(10) << "output dim:" << out->dims(); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 5b71fd7fa3a5ecd1c864c155df2586d293d3d2e6..5e06435b28e2719c2e9fc18de034073f9674a977 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -16,11 +16,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..25247ceaff6c0a5f52a639176ce04c0589cbbd87 --- /dev/null +++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +// TODO(xiongkun): remove the header when decouple the memcpy function in phi. +#include "paddle/fluid/memory/memcpy.h" + +namespace phi { +using Tensor = DenseTensor; +template +struct GetTensorValue { + T operator()(const DeviceContext& ctx, const DenseTensor& tensor) const; +}; + +template +struct IscloseFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& in, + const DenseTensor& other, + const float rtol, + const float atol, + bool equal_nan, + DenseTensor* output); +}; + +template +struct GetTensorValue { + T operator()(const phi::CPUContext& dev_ctx, + const DenseTensor& tensor) const { + return *(tensor.data()); + } +}; + +template +struct GetTensorValue { + T operator()(const phi::GPUContext& dev_ctx, + const DenseTensor& tensor) const { + const T* data = tensor.data(); + T value; + const auto gpu_place = dev_ctx.GetPlace(); + paddle::memory::Copy( + phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); + return value; + } +}; + +template +struct IscloseFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& in, + const DenseTensor& other, + const double rtol, + const double atol, + bool equal_nan, + DenseTensor* output) { + auto* in_a = in.data(); + auto* in_b = other.data(); + auto* out_data = ctx.template Alloc(output); + auto num = in.numel(); + // *out_data = true; + for (int i = 0; i < num; i++) { + out_data[i] = true; + } + for (int i = 0; i < num; i++) { + const T a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + // *out_data &= val; + out_data[i] = val; + } + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +__global__ void IscloseCUDAKernel(const T* in_data, + const T* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const T a = in_data[i], b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + out_data[i] = val; + // if (!val) *out_data = false; + } +} + +template +struct IscloseFunctor { + void operator()(const phi::GPUContext& dev_ctx, + const DenseTensor& in, + const DenseTensor& other, + const double rtol, + const double atol, + bool equal_nan, + DenseTensor* output) { + int num = in.numel(); + const T* in_data = in.data(); + const T* other_data = other.data(); + bool* out_data = dev_ctx.template Alloc(output); + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? block : grid; +#ifdef PADDLE_WITH_HIP + hipMemset(out_data, true, num * sizeof(bool)); +#else + cudaMemset(out_data, true, num * sizeof(bool)); +#endif + IscloseCUDAKernel<<>>( + in_data, other_data, rtol, atol, equal_nan, num, out_data); + } +}; +#endif + +template +void IscloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument("Input(Atol) type must be double")); + + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument("Input(Rtol) type must be double")); + + IscloseFunctor()( + dev_ctx, x, y, rtol.to(), atol.to(), equal_nan, out); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..1ae90960ef4455deff5f9b7e08ed9eb7ff62cba3 --- /dev/null +++ b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +using Array1 = Eigen::DSizes; +template +struct KLDivLossBackward { + HOSTDEVICE KLDivLossBackward() {} + + HOSTDEVICE T operator()(const T& target, const T& grad) const { + if (target <= 0) { + return 0; + } else { + return static_cast(-1.) * grad; + } + } +}; + +template +void KLDivLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& d_out, + const std::string& reduction, + DenseTensor* d_x) { + auto& place = *dev_ctx.eigen_device(); + auto* target = &label; + auto* input_grad = d_x; + auto* loss_grad = &d_out; + + const int n = input_grad->dims()[0]; + const int numel = input_grad->numel(); + const int expand = numel / loss_grad->numel(); + + dev_ctx.template Alloc(input_grad); + + auto target_t = phi::EigenVector::Flatten(*target); + + auto input_grad_t = phi::EigenVector::Flatten(*input_grad); + auto loss_grad_t = phi::EigenVector::Flatten(*loss_grad); + + auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); + auto grad_t = target_t * loss_grad_expand; + input_grad_t.device(place) = + target_t.binaryExpr(grad_t, KLDivLossBackward()); + + if ("mean" == reduction) { + input_grad_t.device(place) = input_grad_t / static_cast(numel); + } else if ("batchmean" == reduction) { + input_grad_t.device(place) = input_grad_t / static_cast(n); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ecd23bbfc1c4598c953555a1f0d21dd0f6a989c8 --- /dev/null +++ b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +using Array1 = Eigen::DSizes; +template +struct KLDivLossForward { + HOSTDEVICE KLDivLossForward() {} + + HOSTDEVICE T operator()(const T& target, const T& input) const { + if (target <= 0) { + return 0; + } else { + return target * (std::log(target) - input); + } + } +}; +template +void KLDivLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const std::string& reduction, + DenseTensor* out) { + auto& place = *(dev_ctx.eigen_device()); + auto* input = &x; + auto* target = &label; + auto* loss = out; + + const int n = input->dims()[0]; + dev_ctx.template Alloc(loss); + + auto input_t = phi::EigenVector::Flatten(*input); + auto target_t = phi::EigenVector::Flatten(*target); + auto loss_t = phi::EigenVector::Flatten(*loss); + auto output = target_t.binaryExpr(input_t, KLDivLossForward()); + if ("none" == reduction) { + loss_t.device(place) = output; + } else if ("batchmean" == reduction) { + auto output_sum = output.sum(); + if (n > 0) { + loss_t.device(place) = output_sum / output_sum.constant(n); + } else { + loss_t.device(place) = output_sum; + } + } else if ("mean" == reduction) { + loss_t.device(place) = output.mean(); + } else if ("sum" == reduction) { + loss_t.device(place) = output.sum(); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..8fb1f1c4fa3615cbf33fb7b6e4b0609dbcc2c3a0 --- /dev/null +++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/phi/kernels/funcs/for_range.h" +namespace phi { +template +struct LgammaGradFunctor { + LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; +template +void LgammaGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + DenseTensor* d_x) { + auto numel = d_out.numel(); + auto* dout_data = d_out.data(); + auto* x_data = x.data(); + auto* dx_data = + dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); + phi::funcs::ForRange for_range(dev_ctx, numel); + LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} +} // namespace phi diff --git a/paddle/phi/kernels/cpu/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h similarity index 100% rename from paddle/phi/kernels/cpu/reduce_grad.h rename to paddle/phi/kernels/impl/reduce_grad.h diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..4a74416e3916492e6d3a40e09ca347db485fff7c --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..baaa544f137366f1e0343c25bc373cc08350f7fd --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..6b93e98cec0168ab55e15e3401a72738f79d3a07 --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h index 9b1e4b1d3a65d5c0da831a36152cff85a3353fa3..044adb0230cac4d0dc6bf9e9348968e4d7c60b5d 100644 --- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -21,12 +21,11 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -119,7 +118,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, const auto H = dims[dims.size() - 2]; const auto W = dims[dims.size() - 1]; phi::funcs::ForRange x_for_range(dev_ctx, dx_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dx_bst.data(), unitriangular, !upper, H, W, dx_bst_upper.data()); x_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..dcc7224b5075ca77db813089af6048f0809c9f35 --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/tril_triu_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { + const auto* dout_data = out_grad.data(); + auto* dx_data = ctx.template Alloc(x_grad); + + const auto& dims = out_grad.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + + phi::funcs::ForRange for_range( + ctx, static_cast(out_grad.numel())); + phi::funcs::TrilTriuCompute tril_triu_grad_computer( + dout_data, diagonal, lower, H, W, dx_data); + for_range(tril_triu_grad_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..959169d87cefd877a4fb056218dd761a96f23136 --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/tril_triu_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { + const auto* x_data = x.data(); + auto* out_data = ctx.template Alloc(out); + + const auto& dims = x.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + phi::funcs::ForRange for_range(ctx, static_cast(x.numel())); + + phi::funcs::TrilTriuCompute tril_triu_computer( + x_data, diagonal, lower, H, W, out_data); + for_range(tril_triu_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/index_select_grad_kernel.h b/paddle/phi/kernels/index_select_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c3dc1595989bf2879c3e20187eaa53b6df75a7f0 --- /dev/null +++ b/paddle/phi/kernels/index_select_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_select_kernel.h b/paddle/phi/kernels/index_select_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..124b6897311575223859fba882488a535a6310f4 --- /dev/null +++ b/paddle/phi/kernels/index_select_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output); + +} // namespace phi diff --git a/paddle/phi/kernels/isclose_kernel.h b/paddle/phi/kernels/isclose_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8c468da055082005568002952f695ebedda31c3c --- /dev/null +++ b/paddle/phi/kernels/isclose_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IscloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/kldiv_loss_grad_kernel.h b/paddle/phi/kernels/kldiv_loss_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8f53898fa6816d198d1bc96bcf99f20752a70551 --- /dev/null +++ b/paddle/phi/kernels/kldiv_loss_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +// XKTODO (change name) +void KLDivLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& d_out, + const std::string& reduction, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/kldiv_loss_kernel.h b/paddle/phi/kernels/kldiv_loss_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..103780ab747282f9171d2a409db9fd4e5269ba4d --- /dev/null +++ b/paddle/phi/kernels/kldiv_loss_kernel.h @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KLDivLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const std::string& reduction, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/kthvalue_grad_kernel.h b/paddle/phi/kernels/kthvalue_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..488dde8237b0882b6606834f8a510ef360da1b24 --- /dev/null +++ b/paddle/phi/kernels/kthvalue_grad_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/kthvalue_kernel.h b/paddle/phi/kernels/kthvalue_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4809b9af4832f5d9c036d26adfc6ba5c7a808889 --- /dev/null +++ b/paddle/phi/kernels/kthvalue_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); +} // namespace phi diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c32be63db4178f92d9564f357c30bb28fb415516 --- /dev/null +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale, + paddle::optional bias, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c9679420bda5cf6beffb56b7ec319c1b80ac4eda --- /dev/null +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormKernel(const Context& ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class LayerNormDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T* input, + std::vector input_shape, + const T* bias, + const T* scale, + T* output, + T* mean, + T* variance, + int begin_norm_axis, + float eps); +}; +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/lgamma_grad_kernel.h b/paddle/phi/kernels/lgamma_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..94173cc29c7a7b7c71b6c781fbe06f9c991a4197 --- /dev/null +++ b/paddle/phi/kernels/lgamma_grad_kernel.h @@ -0,0 +1,27 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LgammaGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/phi/kernels/lgamma_kernel.h similarity index 63% rename from paddle/fluid/operators/reduce_ops/reduce_any_op.cu rename to paddle/phi/kernels/lgamma_kernel.h index 2e93e67debbd9d7f8667e0b2994fdd440401ac13..f61b3a1ce859eeee77d49ecb38ee0e96c3a9f0ee 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/phi/kernels/lgamma_kernel.h @@ -1,4 +1,5 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +13,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { -REGISTER_OP_CUDA_KERNEL( - reduce_any, - ops::ReduceCudaKernel); +template +void LgammaKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_grad_kernel.h b/paddle/phi/kernels/log_softmax_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6336bc14105bb55deacbfdc20a69a56c6ceca81a --- /dev/null +++ b/paddle/phi/kernels/log_softmax_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_kernel.h b/paddle/phi/kernels/log_softmax_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2caaa86d46c35888c5aaa944019c070f0dd64e17 --- /dev/null +++ b/paddle/phi/kernels/log_softmax_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h deleted file mode 100644 index 7569cbcff087d796313c24d46ff7b7fd9cf7e2eb..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/math_kernel.h +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { - -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out); - -template -void AddRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void SubtractRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void MultiplyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -DenseTensor Add(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - AddKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Subtract(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - SubtractKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - DivideKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - MultiplyKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); - MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); - return dense_out; -} - -template -DenseTensor Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumInferMeta(x, axis, dtype, keep_dim, &meta_out); - SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); - return dense_out; -} - -} // namespace phi diff --git a/paddle/phi/kernels/mode_grad_kernel.h b/paddle/phi/kernels/mode_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ccde8c3648fa556401f1937c78039743daf43f4c --- /dev/null +++ b/paddle/phi/kernels/mode_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/mode_kernel.h b/paddle/phi/kernels/mode_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..831c4369304e5c5d27cddf01bcba021745bf7083 --- /dev/null +++ b/paddle/phi/kernels/mode_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_sum_grad_kernel.h b/paddle/phi/kernels/multiplex_grad_kernel.h similarity index 66% rename from paddle/phi/kernels/reduce_sum_grad_kernel.h rename to paddle/phi/kernels/multiplex_grad_kernel.h index ab4d63297efffc70710e496efa08f4b9c7e5f7ce..b32c9dbe100584f7076f34d848d3e5112315f83d 100644 --- a/paddle/phi/kernels/reduce_sum_grad_kernel.h +++ b/paddle/phi/kernels/multiplex_grad_kernel.h @@ -14,19 +14,14 @@ #pragma once -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" + namespace phi { template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType in_dtype, - DataType out_dtype, - DenseTensor* x_grad); + std::vector ins_grad); } // namespace phi diff --git a/paddle/phi/kernels/multiplex_kernel.h b/paddle/phi/kernels/multiplex_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..341c6d5cabb7ce1d67090c7533bc8c45622f4786 --- /dev/null +++ b/paddle/phi/kernels/multiplex_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/prelu_grad_kernel.h b/paddle/phi/kernels/prelu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..15917e2e1f02e896d12e971e7dfa52685f57a676 --- /dev/null +++ b/paddle/phi/kernels/prelu_grad_kernel.h @@ -0,0 +1,31 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_kernel.h b/paddle/phi/kernels/prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..251332a8158dcbfa45cbb6c183e06789c21894db --- /dev/null +++ b/paddle/phi/kernels/prelu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 632ad00f6d06ed8a02b2d9677ff665c677cf8cb9..e02f4450a8babb9dd90cae6d8d1622938ae2f795 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -22,7 +22,6 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -// #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" namespace phi { @@ -591,7 +590,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, int index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { temp[index + index / 32] = - compute(temp[index + index / 2], + compute(temp[index + index / 32], temp[index - stride + (index - stride) / 32]); } } diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2f1e2f589c5122987d9776700f3aa7bd95daa7a5..1d4181f3b9a89509ada2a8fe27d584a9b5aa039c 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -115,6 +115,14 @@ struct BroadcastConfig { } }; +template +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, + int num) { + for (int i = 0; i < num; i++) { + dst[i] = src[i]; + } +} #undef INT_BITS } // namespace details diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 53a8b7d0c9ef9489056ab293d97e5767b23531fe..d2cfdbdec3064c8e9cf20d101afc2adf0ed011a8 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -76,6 +76,16 @@ struct BroadcastConfig { }; #pragma pack() +template +__device__ __forceinline__ void WriteData(T* _global_ptr_ dst, + T* src, + int num) { + if (num > 0) { + LM2GM(src, dst, num * sizeof(T)); + } +} +#undef INT_BITS + } // namespace details /** diff --git a/paddle/phi/kernels/qr_kernel.h b/paddle/phi/kernels/qr_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9c3dfb16601267ec8a1d2535f6854c2a31dba5a8 --- /dev/null +++ b/paddle/phi/kernels/qr_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ee6f3d19a094d29546e82e7138933eceb96459d0 --- /dev/null +++ b/paddle/phi/kernels/reduce_grad_kernel.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel.cc b/paddle/phi/kernels/reduce_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7638c782d547d6b69d0c740827abf96e3ffda0c5 --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); +} + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL( + mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( + prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} +PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(mean, + GPU, + ALL_LAYOUT, + phi::MeanKernel, + float, + double, + bool, + int, + int64_t, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( + prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} +PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..69bcb47bc98eadd46eeff5c1f92ccf9cf0c9a9d3 --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.h @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out); + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +DenseTensor Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); + MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); + return dense_out; +} + +template +DenseTensor Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumInferMeta(x, axis, dtype, keep_dim, &meta_out); + SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/roi_align_grad_kernel.h b/paddle/phi/kernels/roi_align_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..eea1fa03886a4a02dbc614052e1f280c2610f1ad --- /dev/null +++ b/paddle/phi/kernels/roi_align_grad_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h index 16b52c563a592f0cc23ddca94f554f5dc49e8ccf..9734da53b7f453d492cc60ee8930f54e7ca74edc 100644 --- a/paddle/phi/kernels/roi_align_kernel.h +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -20,7 +20,7 @@ namespace phi { template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/kernels/roi_pool_grad_kernel.h b/paddle/phi/kernels/roi_pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f1c378f75c398a714f6aa4e4d857e314f47eeb --- /dev/null +++ b/paddle/phi/kernels/roi_pool_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiPooGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_pool_kernel.h b/paddle/phi/kernels/roi_pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c6ff6f223612a46c00abff103c3b3a193264b122 --- /dev/null +++ b/paddle/phi/kernels/roi_pool_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +static constexpr int kROISize = 4; + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max); + +} // namespace phi diff --git a/paddle/phi/kernels/roll_grad_kernel.h b/paddle/phi/kernels/roll_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..331f3626e56574615a2d6b1680335638b060846d --- /dev/null +++ b/paddle/phi/kernels/roll_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/roll_kernel.h b/paddle/phi/kernels/roll_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..56f32174a4c0005968acf147b2daf25914ff01b1 --- /dev/null +++ b/paddle/phi/kernels/roll_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fae876facfc8fae9b2db783576444ac8bfde09a1 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/assign_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/assign_kernel.h" + +namespace phi { +namespace sr { + +// Note: use `const paddle::optional x` +// as input if needed +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + out->set_rows(x.rows()); + out->set_height(x.height()); + phi::AssignKernel(dev_ctx, x.value(), out->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL(assign_sr, + CPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(assign_sr, + GPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.h b/paddle/phi/kernels/selected_rows/assign_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2ba465615a73a3036d4b029c8ecb54002b86cb97 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.cc b/paddle/phi/kernels/selected_rows/copy_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf71ab0583f6120e7bf10f26f00024b27a56ca79 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/selected_rows/copy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst) { + if (src.value().Holder() != dst->value().Holder() || + src.value().data() != dst->value().data()) { + dst->set_rows(src.rows()); + dst->set_height(src.height()); + } + phi::Copy( + dev_ctx, src.value(), dst_place, blocking, dst->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.h b/paddle/phi/kernels/selected_rows/copy_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4aa848bea2a717ffcda4dff562ec56a702b7dbc5 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..80b2a1f6678a27594f0fd3319ccb938dac67bf13 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { +namespace sr { + +static std::vector PathToRows(const DenseTensor& path) { + std::set rows; + const int64_t* paths = path.data(); + for (int64_t i = 0; i < path.numel(); ++i) { + int64_t row = paths[i]; + if (row < 0) { + continue; + } + rows.emplace(row); + } + return std::vector(rows.begin(), rows.end()); +} + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + path.get_ptr(), + errors::NotFound("Custom tree must be set for sparse mode!")); + paddle::framework::Vector real_rows = PathToRows(*path); + w_grad->set_rows(real_rows); + // Build a map of id -> row_index to speed up finding the index of one id + w_grad->set_height(w.dims()[0]); + auto* w_grad_value = w_grad->mutable_value(); + phi::DDim temp_dim(w.dims()); + temp_dim[0] = real_rows.size(); + w_grad_value->Resize(temp_dim); + phi::HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad_value, + bias_grad, + w_grad); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad_sr, + CPU, + ALL_LAYOUT, + phi::sr::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..557c8b1bc5eed2c64f3a5c16d52cead124815ffc --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc index 9bcd5d8544e2d73961d72115023446d427e8895e..67126d82042b28de8c560a55046e50029153290d 100644 --- a/paddle/phi/kernels/selected_rows/shape_kernel.cc +++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/shape_kernel.h" namespace phi { namespace sr { @@ -25,15 +26,7 @@ template void ShapeKernel(const Context& ctx, const SelectedRows& input, DenseTensor* out) { - auto in_var = input; - phi::DDim in_dims; - in_dims = in_var.value().dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } + phi::ShapeKernel(ctx, input.value(), out); } } // namespace sr diff --git a/paddle/phi/kernels/gpu/shape_kernel.cu b/paddle/phi/kernels/shape_kernel.cc similarity index 53% rename from paddle/phi/kernels/gpu/shape_kernel.cu rename to paddle/phi/kernels/shape_kernel.cc index 39b6eaeaef2a8e80d204941dc1f3ac92907aa786..dd26a7edc9cdd8e1917bb5d88e957b3e7d545f93 100644 --- a/paddle/phi/kernels/gpu/shape_kernel.cu +++ b/paddle/phi/kernels/shape_kernel.cc @@ -13,12 +13,43 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/shape_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" +namespace phi { + +template +void ShapeKernel(const Context& ctx, + const DenseTensor& input, + DenseTensor* out) { + auto in_var = &input; + phi::DDim in_dims; + in_dims = in_var->dims(); + auto out_t = out; + out_t->Resize({in_dims.size()}); + auto out_data = ctx.template HostAlloc(out_t); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(shape, + CPU, + ALL_LAYOUT, + phi::ShapeKernel, + bool, + int, + int8_t, + uint8_t, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(shape, GPU, ALL_LAYOUT, @@ -33,3 +64,4 @@ PD_REGISTER_KERNEL(shape, phi::dtype::complex, phi::dtype::complex, phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index 42bde442e1e063a355d2eabb2963865a2ff45bcb..23e059c72e77615e2c24aed961d22b3154c30449 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -27,7 +27,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -41,7 +41,7 @@ std::vector Conv3dGrad(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 64c32df18971c4d66873b02a61220a5bed8db005..93a335e2f1c35700d2bf5ef54400c52ed54f6be2 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -34,7 +34,7 @@ using Dims4D = phi::funcs::sparse::Dims4D; template void ProductRuleBook(const Context& dev_ctx, const SparseCooTensor& x, - const DenseTensor& kernel, + const std::vector& kernel_sizes, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -42,19 +42,19 @@ void ProductRuleBook(const Context& dev_ctx, const bool subm, DenseTensor* rulebook, DenseTensor* counter_per_kernel) { - const auto& kernel_dims = kernel.dims(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const int* indices_ptr = non_zero_indices.data(); int* counter_ptr = counter_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; memset(counter_ptr, 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len const auto& x_dims = x.dims(); const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - const Dims4D c_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); + const Dims4D c_kernel_dims( + 1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]); const Dims4D c_strides(1, strides[2], strides[1], strides[0]); @@ -75,9 +75,9 @@ void ProductRuleBook(const Context& dev_ctx, auto f_calc_rulebook = [&](int* rulebook_ptr) { int kernel_index = 0, rulebook_index = 0; - for (int kz = 0; kz < kernel_dims[0]; kz++) { - for (int ky = 0; ky < kernel_dims[1]; ky++) { - for (int kx = 0; kx < kernel_dims[2]; kx++) { + for (int kz = 0; kz < kernel_sizes[0]; kz++) { + for (int ky = 0; ky < kernel_sizes[1]; ky++) { + for (int kx = 0; kx < kernel_sizes[2]; kx++) { ++kernel_index; for (int64_t i = 0; i < non_zero_num; i++) { int batch = indices_ptr[i]; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 5d7b381b7cb0beef7e69608ce7f732d8cdf9d222..3348d81cf6b4bbffe7f6db24dbe12fef24cadf40 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -33,7 +33,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -113,7 +113,7 @@ void Conv3dGradKernel(const Context& dev_ctx, rulebook_len, in_channels, in_features_ptr); - Gather(out_grad.non_zero_elements().data(), + Gather(out_grad.data(), rulebook_ptr + rulebook_len * 2, rulebook_len, out_channels, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 746ca04a826c020201e53803dd2ec83519cf576e..f022e4ef4bb63617018d6e6ecdf2560b72dead3a 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -44,8 +44,13 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } + phi::funcs::sparse::GetOutShape( - x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; @@ -63,7 +68,7 @@ void Conv3dKernel(const Context& dev_ctx, ProductRuleBook(dev_ctx, x, - kernel, + kernel_sizes, subm_paddings, dilations, subm_strides, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..3010d480b55c9583ff5af9271b2e063667a69da7 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel()); + + phi::funcs::MaxPoolGrad grad_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + for (int c = 0; c < channels; c++) { + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..86971242df5aeed5b0acd74f23db185e02544846 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + + const T* in_features_ptr = x.non_zero_elements().data(); + // 1. product rule book + ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); + + int rulebook_len = rulebook->dims()[1]; + const int* rulebook_ptr = rulebook->data(); + const int* counter_ptr = counter_per_kernel.data(); + + std::vector offsets(kernel_size + 1); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + std::vector out_flags(out->nnz(), false); + + // 2. max pool + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + phi::funcs::MaxPool max_pool_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + if (!out_flags[out_i]) { + out_flags[out_i] = true; + memcpy(&out_features_ptr[out_i * in_channels], + &in_features_ptr[in_i * in_channels], + in_channels * sizeof(T)); + } else { + for (int c = 0; c < in_channels; c++) { + max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], + &out_features_ptr[out_i * in_channels + c]); + } + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 8826fd7cf87e0a7a4a8251b4da823f18190f4a38..5b928817f64d748ec824a2c28e569181034d1072 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,11 +23,15 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + // TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace // this kernel with phi::GatherCUDAKernel; // Vectorization can be used to improve read and write bandwidth @@ -139,5 +143,494 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx, return new_end.first; } +template +__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, + const int n, + const int rulebook_len, + const int kernel_size, + T* rulebook_ptr, + int* counter_ptr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int cache_count[]; // kernel_size + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_count[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < n; i += gridDim.x * blockDim.x) { + int index = indexs[i]; + int kernel_index = rulebook_ptr[index]; + rulebook_ptr[index + rulebook_len] = -1; + rulebook_ptr[index + 2 * rulebook_len] = -1; + rulebook_ptr[index] = -1; + atomicAdd(&cache_count[kernel_index], 1); + } + __syncthreads(); + + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicSub(&counter_ptr[i], cache_count[i]); + } +} + +/** + * @brief: update the out index and indices + * unique_keys: save the index of the output feature list + * unique_values: indiates the index of key before deduplication + * out_indexs: indicates the position of the output index in the rulebook + * rulebook_len: indicates the length of rulebook + * out_dims: indicates the output dims + * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) + * rulebook_out_indexs: the output index in rulebook +**/ +template +__global__ void UpdateIndexKernel(const int* unique_keys, + const int* unique_values, + const int* out_indexs, + const int non_zero_num, + const int rulebook_len, + const Dims4D out_dims, + T* out_indices, + T* rulebook_out_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + const int index = unique_keys[i]; + int batch, x, y, z; + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); + // get out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + + // update rulebook + int start = unique_values[i]; + int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1]; + // max(end-start) = kernel_size + for (int j = start; j < end; j++) { + rulebook_out_indexs[out_indexs[j]] = i; + } + } +} + +// brief: calculation the distance between start and end +template +__global__ void DistanceKernel(const T* start, const T* end, int* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +/** + * @brief product rulebook + * for input_i in x_indices: + * if input_i participate in the convolution calculation: + * infer the output_i by input_i and kernel_i + * save output_i + * + * x_indices: the indices of input features + * x_dims: the input dims + * kernel_dims: the kernel dims + * out_dims: the output dims + * non_zero_num: the number of input features + * rulebook: the rulebook to save the kernel index, input index and output index + * counter: save the number of times each location in the kernel participates in + *the caculation +**/ +template +__global__ void ProductRuleBookKernel(const T* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + const bool subm, + T* rulebook, + int* counter, + int* in_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int counter_buf[]; // kernel_size + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + int batch = x_indices[i]; + int in_z = x_indices[i + non_zero_num]; + int in_y = x_indices[i + 2 * non_zero_num]; + int in_x = x_indices[i + 3 * non_zero_num]; + if (subm) { + in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); + } + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int in_i = -1, out_index = -1, kernel_i = -1; + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; + in_i = i; + out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); + atomicAdd(&counter_buf[kernel_index], 1); + kernel_i = kernel_index; + } + rulebook[kernel_index * non_zero_num + i] = kernel_i; + rulebook[kernel_index * non_zero_num + offset + i] = in_i; + rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicAdd(&counter[i], counter_buf[i]); + } +} + +// the basic algorithm can refer to convolution_kernel.cc or +// the second paper +// example: +// 1. the rulebook: +// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... +// the out_index(key): 20, 30, 33, 30, 33, 20, 25 +// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... +// 3. sorted the (key, value) +// 4. unique the (key, value): +// unique_key: 20, 25, 30, 33 +// unique_values: 0, 2, 3, 5 +// the index of unique_values is: 0, 1, 2, 3 +// 5. update the out_index by unique_key, uniqe_value and the index of +// unique_value: +// the new out_index: 0, 2, 3, 2, 3, 0, 1 +template +int ProductRuleBook(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const DDim& out_dims, + const bool subm, + DenseTensor* rulebook, + DenseTensor* counter_per_kernel, + DenseTensor* offsets_per_kernel, + DenseTensor* out_index, + DenseTensor* unique_key, + DenseTensor* unique_value, + SparseCooTensor* out, + std::vector* h_counter, + std::vector* h_offsets) { + const int64_t non_zero_num = x.nnz(); + const auto& non_zero_indices = x.non_zero_indices(); + const int* indices_ptr = non_zero_indices.data(); + DenseTensor in_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + int* counter_ptr = counter_per_kernel->data(); + int* offsets_ptr = offsets_per_kernel->data(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); + int* rulebook_ptr = rulebook->data(); + + const auto x_dims = x.dims(); + Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); + Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); + Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); + Dims4D d_strides(1, strides[2], strides[1], strides[0]); + Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); + + // 1. product rule book + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, counter_per_kernel, 0); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + +// 2. remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + rulebook_rows * rulebook_cols, + -1); + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + int rulebook_len = 0; + phi::backends::gpu::GpuMemcpyAsync( + &rulebook_len, + rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + rulebook_len /= 3; + dev_ctx.Wait(); + + if (subm) { + // At present, hashtable is not used to map the input and output indexes. + // At present, the intermediate output index is generated by normal + // convolution, + // and then the intermediate output index is subtracted from the input index + // to obain the rulebook. + // get difference + int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; + int32_t* B_key_ptr = in_indexs.data(); + DenseTensor A_val = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + DenseTensor B_val = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + phi::IndexKernel>( + dev_ctx, &A_val, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &B_val, kps::IdentityFunctor()); + DenseTensor key_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); + DenseTensor val_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + std::vector offsets(kernel_size, 0); + // TODO(zhangkaihuo): used unified memcpy interface + phi::backends::gpu::GpuMemcpyAsync(offsets.data(), + offsets_ptr, + kernel_size * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + thrust::pair end; + // Because set_diff does not support duplicate data, set_diff is performed + // separately for each segment of data. + // TODO(zhangkaihuo): Using hashtable here may get better performance, + // further tests ared needed. + for (int i = 0; i < kernel_size; i++) { + int start = offsets[i]; + int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; + int* key_result_start = (i == 0 ? key_result.data() : end.first); + int* val_result_start = i == 0 ? val_result.data() : end.second; + end = +#ifdef PADDLE_WITH_HIP + thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + A_key_ptr + start, + A_key_ptr + stop, + B_key_ptr, + B_key_ptr + x.nnz(), + A_val.data() + start, + B_val.data(), + key_result_start, + val_result_start); + } + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + key_result.data(), + end.first, + key_result.data() + rulebook_len); + int len = 0; + phi::backends::gpu::GpuMemcpyAsync(&len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + // set the diff value = -1, and update counter + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); + SetFlagAndUpdateCounterKernel<<>>( + val_result.data(), + len, + rulebook_len, + kernel_size, + rulebook_ptr, + counter_ptr); +// remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + 3 * rulebook_len, + -1); + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, key_result.data() + rulebook_len); + phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + rulebook_len /= 3; + } + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + rulebook->Resize({rulebook_rows, rulebook_len}); + + // 3. sorted or merge the out index + out_index->ResizeAndAllocate({rulebook_len}); + unique_value->ResizeAndAllocate({rulebook_len}); + unique_key->ResizeAndAllocate({rulebook_len}); + int* out_index_ptr = out_index->data(); + int* unique_value_ptr = unique_value->data(); + int* unique_key_ptr = unique_key->data(); + + int* new_end = SortedAndUniqueIndex(dev_ctx, + rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index, + unique_key, + unique_value); + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + DistanceKernel<<<1, 1>>>( + unique_key_ptr, + new_end, + rulebook_ptr + rulebook_rows * rulebook_cols - 1); + int out_non_zero_num = 0; +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + + // 5. update out_indices and rulebook by unique_value_ptr + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + + int* out_indices_ptr = out_indices.data(); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + UpdateIndexKernel<<>>(unique_key_ptr, + unique_value_ptr, + out_index_ptr, + out_non_zero_num, + rulebook_len, + d_out_dims, + out_indices_ptr, + rulebook_ptr + 2 * rulebook_len); + out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d6d992d0f4b651b1a9a47cdddcae116a215a0e57..4db0a0b0011b5a664b66d54f6d42f2e1954ccd12 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -140,12 +140,11 @@ void Conv3dGradKernel(const Context& dev_ctx, GatherKernel<<>>( - out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + dev_ctx.stream()>>>(out_grad.data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 1a0c7e9b972145fbc98cdb9dfee0267a9eaa9f90..214e689e9370a313e66be0281db177407d7b87f0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include - -#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/index_impl.cu.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" namespace phi { namespace sparse { -using Dims4D = phi::funcs::sparse::Dims4D; - -__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, - const int n, - const int rulebook_len, - const int kernel_size, - int* rulebook_ptr, - int* counter_ptr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int cache_count[]; // kernel_size - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - cache_count[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < n; i += gridDim.x * blockDim.x) { - int index = indexs[i]; - int kernel_index = rulebook_ptr[index]; - rulebook_ptr[index + rulebook_len] = -1; - rulebook_ptr[index + 2 * rulebook_len] = -1; - rulebook_ptr[index] = -1; - atomicAdd(&cache_count[kernel_index], 1); - } - __syncthreads(); - - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicSub(&counter_ptr[i], cache_count[i]); - } -} - -/** - * @brief: update the out index and indices - * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication - * out_indexs: indicates the position of the output index in the rulebook - * rulebook_len: indicates the length of rulebook - * out_dims: indicates the output dims - * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) - * rulebook_out_indexs: the output index in rulebook -**/ -__global__ void UpdateIndexKernel(const int* unique_keys, - const int* unique_values, - const int* out_indexs, - const int non_zero_num, - const int rulebook_len, - const Dims4D out_dims, - int* out_indices, - int* rulebook_out_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - const int index = unique_keys[i]; - int batch, x, y, z; - phi::funcs::sparse::IndexToPoint( - index, out_dims, &batch, &x, &y, &z); - // get out indices - out_indices[i] = batch; - out_indices[i + non_zero_num] = z; - out_indices[i + non_zero_num * 2] = y; - out_indices[i + non_zero_num * 3] = x; - - // update rulebook - int start = unique_values[i]; - int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1]; - // max(end-start) = kernel_size - for (int j = start; j < end; j++) { - rulebook_out_indexs[out_indexs[j]] = i; - } - } -} - -/** - * @brief product rulebook - * for input_i in x_indices: - * if input_i participate in the convolution calculation: - * infer the output_i by input_i and kernel_i - * save output_i - * - * x_indices: the indices of input features - * x_dims: the input dims - * kernel_dims: the kernel dims - * out_dims: the output dims - * non_zero_num: the number of input features - * rulebook: the rulebook to save the kernel index, input index and output index - * counter: save the number of times each location in the kernel participates in - *the caculation -**/ -__global__ void ProductRuleBookKernel(const int* x_indices, - const Dims4D x_dims, - const Dims4D kernel_dims, - const Dims4D out_dims, - const int64_t non_zero_num, - const Dims4D paddings, - const Dims4D dilations, - const Dims4D strides, - const bool subm, - int* rulebook, - int* counter, - int* in_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int counter_buf[]; // kernel_size - const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; - const int offset = kernel_size * non_zero_num; - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - counter_buf[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - int kernel_index = 0; - int batch = x_indices[i]; - int in_z = x_indices[i + non_zero_num]; - int in_y = x_indices[i + 2 * non_zero_num]; - int in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } - for (int kz = 0; kz < kernel_dims[1]; kz++) { - for (int ky = 0; ky < kernel_dims[2]; ky++) { - for (int kx = 0; kx < kernel_dims[3]; kx++) { - int in_i = -1, out_index = -1, kernel_i = -1; - if (phi::funcs::sparse::Check(x_dims, - kernel_dims, - paddings, - dilations, - strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { - int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; - int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; - int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; - in_i = i; - out_index = phi::funcs::sparse::PointToIndex( - batch, out_x, out_y, out_z, out_dims); - atomicAdd(&counter_buf[kernel_index], 1); - kernel_i = kernel_index; - } - rulebook[kernel_index * non_zero_num + i] = kernel_i; - rulebook[kernel_index * non_zero_num + offset + i] = in_i; - rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; - ++kernel_index; - } - } - } - } - __syncthreads(); - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicAdd(&counter[i], counter_buf[i]); - } -} - -// brief: calculation the distance between start and end -__global__ void DistanceKernel(const int* start, - const int* end, - int* distance) { - if (threadIdx.x == 0) { - *distance = end - start; - } -} - -// the basic algorithm can refer to convolution_kernel.cc or -// the second paper -// example: -// 1. the rulebook: -// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... -// the out_index(key): 20, 30, 33, 30, 33, 20, 25 -// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... -// 3. sorted the (key, value) -// 4. unique the (key, value): -// unique_key: 20, 25, 30, 33 -// unique_values: 0, 2, 3, 5 -// the index of unique_values is: 0, 1, 2, 3 -// 5. update the out_index by unique_key, uniqe_value and the index of -// unique_value: -// the new out_index: 0, 2, 3, 2, 3, 0, 1 -template -int ProductRuleBook(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const DDim& out_dims, - const bool subm, - DenseTensor* rulebook, - DenseTensor* counter_per_kernel, - DenseTensor* offsets_per_kernel, - DenseTensor* out_index, - DenseTensor* unique_key, - DenseTensor* unique_value, - SparseCooTensor* out, - std::vector* h_counter, - std::vector* h_offsets) { - const auto& kernel_dims = kernel.dims(); - const int64_t non_zero_num = x.nnz(); - const auto& non_zero_indices = x.non_zero_indices(); - const int* indices_ptr = non_zero_indices.data(); - DenseTensor in_indexs = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); - int* counter_ptr = counter_per_kernel->data(); - int* offsets_ptr = offsets_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; - const int rulebook_rows = 3; - const int rulebook_cols = kernel_size * non_zero_num; - rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); - int* rulebook_ptr = rulebook->data(); - - const auto x_dims = x.dims(); - Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); - Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); - Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); - Dims4D d_strides(1, strides[2], strides[1], strides[0]); - Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); - - // 1. product rule book - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, counter_per_kernel, 0); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - - ProductRuleBookKernel<<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - rulebook_ptr, - counter_ptr, - in_indexs.data()); - -// 2. remove -1 -#ifdef PADDLE_WITH_HIP - int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + rulebook_rows * rulebook_cols, - -1); - - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); - int rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( - &rulebook_len, - rulebook_ptr + 3 * kernel_size * non_zero_num - 1, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - rulebook_len /= 3; - dev_ctx.Wait(); - - if (subm) { - // At present, hashtable is not used to map the input and output indexes. - // At present, the intermediate output index is generated by normal - // convolution, - // and then the intermediate output index is subtracted from the input index - // to obain the rulebook. - // get difference - int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; - int32_t* B_key_ptr = in_indexs.data(); - DenseTensor A_val = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - DenseTensor B_val = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); - phi::IndexKernel>( - dev_ctx, &A_val, kps::IdentityFunctor()); - phi::IndexKernel>( - dev_ctx, &B_val, kps::IdentityFunctor()); - DenseTensor key_result = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); - DenseTensor val_result = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - std::vector offsets(kernel_size, 0); - // TODO(zhangkaihuo): used unified memcpy interface - phi::backends::gpu::GpuMemcpyAsync(offsets.data(), - offsets_ptr, - kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - - thrust::pair end; - // Because set_diff does not support duplicate data, set_diff is performed - // separately for each segment of data. - // TODO(zhangkaihuo): Using hashtable here may get better performance, - // further tests ared needed. - for (int i = 0; i < kernel_size; i++) { - int start = offsets[i]; - int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; - int* key_result_start = (i == 0 ? key_result.data() : end.first); - int* val_result_start = i == 0 ? val_result.data() : end.second; - end = -#ifdef PADDLE_WITH_HIP - thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - A_key_ptr + start, - A_key_ptr + stop, - B_key_ptr, - B_key_ptr + x.nnz(), - A_val.data() + start, - B_val.data(), - key_result_start, - val_result_start); - } - - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - key_result.data(), - end.first, - key_result.data() + rulebook_len); - int len = 0; - phi::backends::gpu::GpuMemcpyAsync(&len, - key_result.data() + rulebook_len, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - // set the diff value = -1, and update counter - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); - SetFlagAndUpdateCounterKernel<<>>(val_result.data(), - len, - rulebook_len, - kernel_size, - rulebook_ptr, - counter_ptr); -// remove -1 -#ifdef PADDLE_WITH_HIP - int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + 3 * rulebook_len, - -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, key_result.data() + rulebook_len); - phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - key_result.data() + rulebook_len, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - rulebook_len /= 3; - } - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - rulebook->Resize({rulebook_rows, rulebook_len}); - - // 3. sorted or merge the out index - out_index->ResizeAndAllocate({rulebook_len}); - unique_value->ResizeAndAllocate({rulebook_len}); - unique_key->ResizeAndAllocate({rulebook_len}); - int* out_index_ptr = out_index->data(); - int* unique_value_ptr = unique_value->data(); - int* unique_key_ptr = unique_key->data(); - - int* new_end = SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + 2 * rulebook_len, - rulebook_len, - out_index, - unique_key, - unique_value); - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - DistanceKernel<<<1, 1>>>(unique_key_ptr, - new_end, - rulebook_ptr + rulebook_rows * rulebook_cols - 1); - int out_non_zero_num = 0; -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - dev_ctx.Wait(); - - // 5. update out_indices and rulebook by unique_value_ptr - const int64_t sparse_dim = 4; - DenseTensorMeta indices_meta( - DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout()); - phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - - int* out_indices_ptr = out_indices.data(); - - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); - UpdateIndexKernel<<>>(unique_key_ptr, - unique_value_ptr, - out_index_ptr, - out_non_zero_num, - rulebook_len, - d_out_dims, - out_indices_ptr, - rulebook_ptr + 2 * rulebook_len); - out->SetMember(out_indices, out_values, out_dims, true); - return rulebook_len; -} - /** * x: (N, D, H, W, C) * kernel: (D, H, W, C, OC) @@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } phi::funcs::sparse::GetOutShape( - x_dims, kernel_dims, paddings, dilations, strides, &out_dims); - out->set_dims(out_dims); + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; std::vector offsets(kernel_size + 1), h_counter(kernel_size); @@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx, int n = ProductRuleBook(dev_ctx, x, - kernel, + kernel_sizes, subm_paddings, dilations, subm_strides, diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1048dd1be0c01c1fa40a8fb2bcab4dca01837d3c --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, + const T* out_features_ptr, + const T* out_grad_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* x_grad_ptr) { + phi::funcs::MaxPoolGrad grad_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int c = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int in_channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0), + h_counter(kernel_size); + phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], + rulebook_ptr, + rulebook_len * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < rulebook_len; i++) { + counter[h_counter[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0.0f)); + + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolGradCudaKernel<<>>( + in_features_ptr, + out_features_ptr, + out_grad_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + x_grad_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f6a0d13b1ddbd375a90808789a61e0cb045a7c9 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -0,0 +1,140 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolCudaKernel(const T* in_features_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* out_features_ptr) { + phi::funcs::MaxPool max_pool_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int channel_i = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], + &out_features_ptr[out_i * channels + channel_i]); + } +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + std::vector offsets(kernel_size + 1), counter(kernel_size); + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + // 1. product rulebook + int rulebook_len = ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_key, + &unique_value, + out, + &counter, + &offsets); + + const int* rulebook_ptr = rulebook->data(); + + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + const T* in_features_ptr = x.non_zero_elements().data(); +// 2. max pool +#ifdef PADDLE_WITH_HIP + thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), +#endif + out_features_ptr, + out_features_ptr + out->non_zero_elements().numel(), + static_cast(-FLT_MAX)); + // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>( + in_features_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + out_features_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..572ade76281bc0e6af6be48ed8cc1a96751412ed --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad); + +template +DenseTensor MaxPoolGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes) { + DenseTensor x_grad = phi::Empty( + dev_ctx, + DenseTensorMeta(x.dtype(), x.non_zero_elements().dims(), x.layout())); + MaxPoolGradKernel( + dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); + return x_grad; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bfadbf72e300fd633e8475475442658a7db20ad9 --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_kernel.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook); + +template +SparseCooTensor MaxPool(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DenseTensor* rulebook) { + DenseTensor indices = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensor values = + phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); + SparseCooTensor coo(indices, values, x.dims()); + MaxPoolKernel( + dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/tril_triu_grad_kernel.h b/paddle/phi/kernels/tril_triu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..10faf5c48d5bffab9f5199ebeefe7d5a2267ecea --- /dev/null +++ b/paddle/phi/kernels/tril_triu_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/tril_triu_kernel.h b/paddle/phi/kernels/tril_triu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4daa84e25c373d6bd5a26f2682385921dc2ce880 --- /dev/null +++ b/paddle/phi/kernels/tril_triu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index c4c13578a989961839600d8ee403e478c76d1345..f8547ced41934a9810dc6874c090ab5aefd43497 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -141,9 +141,19 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - TruncatedNormal(T mean, T std) : mean(mean), std(std) {} + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + T operator()(T value) const { - return std::sqrt(2.0) * Erfinv(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index cbfca5b17ae995a89360c6d6d4987028d95dc281..7ae0dc45c5e1be09a31821c171b84fbb47fe1c9e 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -16,45 +16,54 @@ limitations under the License. */ namespace phi { -#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"X", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"X", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } -#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"Out", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"Out", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } #define comma , -DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT -DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT -DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT -DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT -DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT -DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT -DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT -DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT -DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT -DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT -DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT -DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT -DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT -DefineActGradDepXOpArgMap(ThresholdedRelu, - "thresholded_relu", - "threshold"); // NOLINT - -DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT -DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, + "thresholded_relu", + "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT + +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid, + "hard_sigmoid", + "slope" comma "offset"); // NOLINT KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { @@ -75,6 +84,20 @@ KernelSignature TanhTripleGradOpArgumentMapping( {"D_OutNew", "D_DOut", "D_DDx"}); } +KernelSignature SigmoidDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature SigmoidTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sigmoid_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + KernelSignature LeakyReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -85,11 +108,32 @@ KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); } +KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); +} + +KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu_grad", + {"X", "Out", GradVarName("Out")}, + {"alpha"}, + {GradVarName("X")}); +} + +KernelSignature EluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); +PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); +PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -118,3 +162,22 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, phi::LeakyReluDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, phi::ThresholdedReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softshrink_grad, + phi::SoftShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad, + phi::HardShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad, + phi::TanhShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, + phi::SigmoidDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, + phi::SigmoidTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, + phi::LogSigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, + phi::HardSigmoidGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/assign_sig.cc b/paddle/phi/ops/compat/assign_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d149e8e6a9aa04d3cc8d02e370e7e07e3cbebeb0 --- /dev/null +++ b/paddle/phi/ops/compat/assign_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AssignOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("X")) { + if (ctx.IsDenseTensorVectorInput("X")) { + return KernelSignature("assign_array", {"X"}, {}, {"Out"}); + } else if (ctx.IsSelectedRowsInput("X")) { + return KernelSignature("assign_sr", {"X"}, {}, {"Out"}); + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(assign, phi::AssignOpArgumentMapping); diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..01084e764ed9e41ffb1e67cda26051f5a61fdeeb --- /dev/null +++ b/paddle/phi/ops/compat/cumprod_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature CumprodGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("cumprod_grad", + {"X", "Out", GradVarName("Out")}, + {"dim"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(cumprod_grad, phi::CumprodGradGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/deformable_conv_sig.cc b/paddle/phi/ops/compat/deformable_conv_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2a21673634c30988c64e74ffdb1f489a2392f63 --- /dev/null +++ b/paddle/phi/ops/compat/deformable_conv_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeformableConvOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("deformable_conv", + {"Input", "Offset", "Filter", "Mask"}, + {"strides", + "paddings", + "dilations", + "deformable_groups", + "groups", + "im2col_step"}, + {"Output"}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(deformable_conv, + phi::DeformableConvOpArgumentMapping); diff --git a/paddle/phi/ops/compat/determinant_sig.cc b/paddle/phi/ops/compat/determinant_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bcd30ec5d79b9e137c3dc3fa38f0498e9fe01de --- /dev/null +++ b/paddle/phi/ops/compat/determinant_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeterminantGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("determinant_grad", + {"Input", "Out", GradVarName("Out")}, + {}, + {GradVarName("Input")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(determinant_grad, + phi::DeterminantGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gather_sig.cc b/paddle/phi/ops/compat/gather_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c47bbe48b8ee18527cfef41fad3488bef6c1dd9 --- /dev/null +++ b/paddle/phi/ops/compat/gather_sig.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather", {"X", "Index"}, {"Axis"}, {"Out"}); + } else { + return KernelSignature("gather", {"X", "Index"}, {"axis"}, {"Out"}); + } +} + +KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"Axis", "overwrite"}, + {GradVarName("X")}); + } else { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"axis", "overwrite"}, + {GradVarName("X")}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gather, phi::GatherOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_grad, phi::GatherGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gelu_sig.cc b/paddle/phi/ops/compat/gelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf4b47bcf5fa9c1fb9d03f6b332c0c867211f5ac --- /dev/null +++ b/paddle/phi/ops/compat/gelu_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu", {"X"}, {"approximate"}, {"Out"}); +} + +KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu_grad", + {"X", GradVarName("Out")}, + {"approximate"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping); diff --git a/paddle/phi/ops/compat/grid_sampler_sig.cc b/paddle/phi/ops/compat/grid_sampler_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..b76a9770d4dede5ea604f69858201c2fb035070d --- /dev/null +++ b/paddle/phi/ops/compat/grid_sampler_sig.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GridSamplerOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample", + {"X", "Grid"}, + {"mode", "padding_mode", "align_corners"}, + {"Output"}); +} + +KernelSignature GridSamplerGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample_grad", + {"X", "Grid", GradVarName("Output")}, + {"mode", "padding_mode", "align_corners"}, + {GradVarName("X"), GradVarName("Grid")}); +} + +} // namespace phi + +// use Python API name as kernel name +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample); +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad); + +PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad, + phi::GridSamplerGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..20183d1a9b06634c38f9aa57a31cd58363e0095b --- /dev/null +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature HierarchicalSigmoidOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {"Out", "PreOut", "W_Out"}); +} + +KernelSignature HierarchicalSigmoidGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else if (ctx.IsSelectedRowsOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad_sr", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid, + phi::HierarchicalSigmoidOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid_grad, + phi::HierarchicalSigmoidGradOpArgumentMapping); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/phi/ops/compat/index_select_sig.cc similarity index 50% rename from paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu rename to paddle/phi/ops/compat/index_select_sig.cc index a578c9f7d81083c533028b9c8912a24006ed0292..53eff1bbcd7ed5269299ccfe41631a699e3d0a32 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/phi/ops/compat/index_select_sig.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -// .part used to speed up nvcc compile -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/phi/core/compat/op_utils.h" -template -using CUDAReduceMeanGradKernel = - ops::ReduceCudaGradKernel; +namespace phi { -REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel); +KernelSignature IndexSelectGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_select_grad", + {"X", "Index", GradVarName("Out")}, + {"dim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_select_grad, + phi::IndexSelectGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/isclose_sig.cc b/paddle/phi/ops/compat/isclose_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..08632e990958dd5e19d46c7c5a0ba093c10e65f1 --- /dev/null +++ b/paddle/phi/ops/compat/isclose_sig.cc @@ -0,0 +1,50 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IscloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Rtol")) { + if (ctx.HasInput("Atol")) { + return KernelSignature("isclose", + {"Input", "Other"}, + {"Rtol", "Atol", "equal_nan"}, + {"Out"}); + + } else { + return KernelSignature("isclose", + {"Input", "Other"}, + {"Rtol", "atol", "equal_nan"}, + {"Out"}); + } + } else { + if (ctx.HasInput("Atol")) { + return KernelSignature("isclose", + {"Input", "Other"}, + {"rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("isclose", + {"Input", "Other"}, + {"rtol", "atol", "equal_nan"}, + {"Out"}); + } + } +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(isclose, phi::IscloseOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kldiv_loss_sig.cc b/paddle/phi/ops/compat/kldiv_loss_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..22d2f074e9f13c7ba65c6bcbb4b5542881d4128c --- /dev/null +++ b/paddle/phi/ops/compat/kldiv_loss_sig.cc @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KLDivLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("kldiv_loss_grad", + {"X", "Target", GradVarName("Loss")}, + {"reduction"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad, + phi::KLDivLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kthvalue_sig.cc b/paddle/phi/ops/compat/kthvalue_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e59e9de1e43822ed8d50b8c1d1888e0d1d14540f --- /dev/null +++ b/paddle/phi/ops/compat/kthvalue_sig.cc @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KthvalueGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("kthvalue_grad", + {GradVarName("Out"), "X", "Indices"}, + {"k", "axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..17a81e9ec012f2c116762ff2d653bb96f0e1c4f4 --- /dev/null +++ b/paddle/phi/ops/compat/layer_norm_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("layer_norm", + {"X", "Scale", "Bias"}, + {"epsilon", "begin_norm_axis", "is_test"}, + {"Y", "Mean", "Variance"}); +} + +KernelSignature LayerNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "layer_norm_grad", + {"X", "Mean", "Variance", "Scale", "Bias", GradVarName("Y")}, + {"epsilon", "begin_norm_axis", "is_test"}, + {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad, + phi::LayerNormGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/lgamma_sig.cc b/paddle/phi/ops/compat/lgamma_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..968ad4923ba7b4410f0643335c275059e6ea7bea --- /dev/null +++ b/paddle/phi/ops/compat/lgamma_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "lgamma_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(lgamma_grad, phi::LgammaGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/log_softmax_sig.cc b/paddle/phi/ops/compat/log_softmax_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1ecc6d56768f069c208a0230722929200f1dfe0 --- /dev/null +++ b/paddle/phi/ops/compat/log_softmax_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogSoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("log_softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad, + phi::LogSoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..20994c08aa73c33328568e334d258c44eef68171 --- /dev/null +++ b/paddle/phi/ops/compat/mode_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"}); +} + +KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mode_grad", + {"X", "Indices", GradVarName("Out")}, + {"axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/multiplex_sig.cc b/paddle/phi/ops/compat/multiplex_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dab4655d172312a7389d0bb243e31ee39ef5981 --- /dev/null +++ b/paddle/phi/ops/compat/multiplex_sig.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"}); +} + +KernelSignature MultiplexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/prelu_sig.cc b/paddle/phi/ops/compat/prelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd296c5e95318332523a3cf07e85f1afd6f8a95c --- /dev/null +++ b/paddle/phi/ops/compat/prelu_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("prelu_grad", + {"X", "Alpha", GradVarName("Out")}, + {"mode", "data_format"}, + {GradVarName("X"), GradVarName("Alpha")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PReluGradOpArgumentMapping); diff --git a/paddle/phi/kernels/impl/shape_kernel_impl.h b/paddle/phi/ops/compat/qr_sig.cc similarity index 57% rename from paddle/phi/kernels/impl/shape_kernel_impl.h rename to paddle/phi/ops/compat/qr_sig.cc index 982cfb33f6b14fc14c7c58ff8c4548a4cdbd3b3b..dd424d590ee113adfab0e9643c3c7ffc519f86e4 100644 --- a/paddle/phi/kernels/impl/shape_kernel_impl.h +++ b/paddle/phi/ops/compat/qr_sig.cc @@ -12,25 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - -#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/compat/op_utils.h" namespace phi { -template -void ShapeKernel(const Context& ctx, - const DenseTensor& input, - DenseTensor* out) { - auto in_var = &input; - phi::DDim in_dims; - in_dims = in_var->dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } +KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"}); } } // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 997f1505bd08d991aa3f13f1ad831c0107664b2f..4bca0523801c1a94f90197c93cc495c2c4f56eeb 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -41,8 +41,7 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "mean_raw" KernelSignature. // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with - // the - // "mean_raw" KernelSignature + // the "mean_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -53,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "max_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "max_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "prod_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("prod", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); } KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { @@ -63,8 +73,7 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "max_raw" KernelSignature. // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with - // the - // "max_raw" KernelSignature + // the "max_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -74,6 +83,50 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceMinOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "min_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "min_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "min_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("min", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "any_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "any_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "any_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("any", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("all", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + KernelSignature ReduceSumGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -83,16 +136,73 @@ KernelSignature ReduceSumGradOpArgumentMapping( {GradVarName("X")}); } +KernelSignature ReduceMeanGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "mean_grad", + {"X", GradVarName("Out")}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "min_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceProdGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "prod_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod); +PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); +PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); + PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_mean_grad, mean_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_min, phi::ReduceMinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_all, phi::ReduceAllOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping); + PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad, phi::ReduceSumGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_mean_grad, + phi::ReduceMeanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad, + phi::ReduceProdGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad, + phi::ReduceMaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad, + phi::ReduceMinGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc index 0549103b6fbcb8b2367c34c8a44fb3b52f318859..1717ec8f788091fc5eae59c40a32a30c355760e8 100644 --- a/paddle/phi/ops/compat/roi_align_sig.cc +++ b/paddle/phi/ops/compat/roi_align_sig.cc @@ -16,7 +16,7 @@ namespace phi { -KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { +KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("roi_align", {"X", "ROIs", "RoisNum"}, {"pooled_height", @@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { {"Out"}); } +KernelSignature RoiAlignGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_align_grad", + {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"pooled_height", + "pooled_width", + "spatial_scale", + "sampling_ratio", + "aligned"}, + {GradVarName("X")}); +} + } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/roi_pool_sig.cc b/paddle/phi/ops/compat/roi_pool_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d04c645f183c6e1ac91e4bf6003427008a24fe42 --- /dev/null +++ b/paddle/phi/ops/compat/roi_pool_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {"Out", "Argmax"}); +} + +KernelSignature RoiPoolOpGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool_grad", + {"X", "ROIs", "RoisNum", "Argmax", GradVarName("Out")}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolOpGradArgumentMapping); diff --git a/paddle/phi/ops/compat/roll_sig.cc b/paddle/phi/ops/compat/roll_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..a144f0e8e8a90eee0bf0a8a80455b1e19611880c --- /dev/null +++ b/paddle/phi/ops/compat/roll_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("ShiftsTensor")) { + return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"}); + } + return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"}); +} + +KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roll_grad", + {"X", GradVarName("Out")}, + {"shifts", "axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index 9653250bded84f8ff87f613f6e17e50e351504fa..5feff54b028ba437125d65e4a6709254704164d8 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Input")) { - if (ctx.HasInput("StartsTensorList")) { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -197,7 +197,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -374,8 +374,8 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -551,7 +551,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -734,9 +734,9 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature SetValueGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - if (ctx.HasInput("StartsTensorList")) { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -760,7 +760,7 @@ KernelSignature SetValueGradOpArgumentMapping( {GradVarName("Input"), GradVarName("ValueTensor")}); } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -785,8 +785,8 @@ KernelSignature SetValueGradOpArgumentMapping( } } } else { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -810,7 +810,7 @@ KernelSignature SetValueGradOpArgumentMapping( {GradVarName("Input"), GradVarName("ValueTensor")}); } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/phi/ops/compat/tile_sig.cc index 49a6d02225d931f1dc2d3324cb13c2c620f5dfe6..ca3fa5fe1f86ac13252c04c05c0508c47feded42 100644 --- a/paddle/phi/ops/compat/tile_sig.cc +++ b/paddle/phi/ops/compat/tile_sig.cc @@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("RepeatTimes")) { return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"}); } else if (ctx.InputSize("repeat_times_tensor") > 0) { + const auto& repeat_times = + paddle::any_cast>(ctx.Attr("repeat_times")); + if (!ctx.IsRuntime() && !repeat_times.empty()) { + return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); + } return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"}); } else { return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f79f8650decfc6556287be2caefa6d1074ecf7f --- /dev/null +++ b/paddle/phi/ops/compat/tril_triu_sig.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); +} + +KernelSignature TrilTriuGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu_grad", + {GradVarName("Out")}, + {"diagonal", "lower"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc index 8595782be35ab677ef40eb31b8a09237e90f359a..da66334ced78ac92b85bc2effc749888ec3da4ad 100644 --- a/paddle/phi/tests/api/test_sparse_utils_api.cc +++ b/paddle/phi/tests/api/test_sparse_utils_api.cc @@ -53,8 +53,7 @@ TEST(API, to_sparse_coo) { // 1. test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); - auto out = paddle::experimental::sparse::to_sparse_coo( - x, phi::Backend::CPU, sparse_dim); + auto out = paddle::experimental::sparse::to_sparse_coo(x, sparse_dim); auto coo = std::dynamic_pointer_cast(out.impl()); ASSERT_EQ(coo->nnz(), non_zero_num); int cmp_indices = memcmp(coo->non_zero_indices().data(), @@ -91,8 +90,7 @@ TEST(API, to_sparse_coo) { auto csr = std::make_shared(crows, cols, values, dense_dims); paddle::experimental::Tensor csr_x(csr); - auto out2 = paddle::experimental::sparse::to_sparse_coo( - csr_x, phi::Backend::CPU, sparse_dim); + auto out2 = paddle::experimental::sparse::to_sparse_coo(csr_x, sparse_dim); auto coo2 = std::dynamic_pointer_cast(out.impl()); ASSERT_EQ(coo2->nnz(), non_zero_num); @@ -132,7 +130,7 @@ TEST(API, to_sparse_csr) { // 1. test dense_to_sparse_csr paddle::experimental::Tensor x(dense_x); - auto out = paddle::experimental::sparse::to_sparse_csr(x, phi::Backend::CPU); + auto out = paddle::experimental::sparse::to_sparse_csr(x); auto csr = std::dynamic_pointer_cast(out.impl()); auto check = [&](const phi::SparseCsrTensor& csr) { ASSERT_EQ(csr.non_zero_cols().numel(), non_zero_num); @@ -170,8 +168,7 @@ TEST(API, to_sparse_csr) { auto coo = std::make_shared(indices, values, dense_dims); paddle::experimental::Tensor coo_x(coo); - auto out2 = - paddle::experimental::sparse::to_sparse_csr(coo_x, phi::Backend::CPU); + auto out2 = paddle::experimental::sparse::to_sparse_csr(coo_x); auto csr2 = std::dynamic_pointer_cast(out.impl()); check(*csr2); @@ -212,7 +209,7 @@ TEST(API, to_dense) { std::make_shared(indices, values, dense_dims); paddle::experimental::Tensor coo_x(coo); - auto out = paddle::experimental::sparse::to_dense(coo_x, phi::Backend::CPU); + auto out = paddle::experimental::sparse::to_dense(coo_x); auto dense_out = std::dynamic_pointer_cast(out.impl()); int cmp1 = memcmp(dense_out->data(), &dense_data[0][0], 9 * sizeof(float)); @@ -237,7 +234,7 @@ TEST(API, to_dense) { auto csr = std::make_shared(crows, cols, values, dense_dims); paddle::experimental::Tensor csr_x(csr); - auto out2 = paddle::experimental::sparse::to_dense(csr_x, phi::Backend::CPU); + auto out2 = paddle::experimental::sparse::to_dense(csr_x); auto dense_out2 = std::dynamic_pointer_cast(out.impl()); int cmp2 = diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index 317dcce92c8edd1bb76b080cdb578d37eb8b1f58..3897c182e481ce3ae81c406c35e138adf2f7071f 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -14,6 +14,7 @@ cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils) cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_conv3d_dev_api SRCS test_sparse_conv3d_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sparse_pool_dev_api SRCS test_sparse_pool_dev_api.cc DEPS phi phi_api_utils) cc_test(test_math_function SRCS test_math_function.cc DEPS math_function) if(WITH_GPU) diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index d69c7b2174f726d5757ea707678ddb383cf19d68..460d85f83133f9ecef83daa4e6a446e53485cd0e 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -61,6 +61,10 @@ TEST(DEV_API, copy) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); phi::Copy( dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 3e5f96507415624750eb297953719f397e294230..9552c02976f30d11601967034815545f94ff1f97 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index dc283728ee5f761e79c9c396d63121d555139dee..e3f2e8b57e3df48d860734f164f41be95f6f3d96 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -58,6 +58,10 @@ TEST(DEV_API, flatten) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); // 2. test API diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index 23edfeacaf81436d6381be674c72a27ae96e0b41..ce31b2021e01a4130038e7e26bc37fd3e13ef27a 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc index 16ad4fc341be0ac68c571b29ffe182ae5d4c625f..7de039372fa9c2b46d5b6f9b430a816382072449 100644 --- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc @@ -50,6 +50,10 @@ TEST(DEV_API, reshape) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); auto out = phi::Reshape(dev_ctx, dense_x, shape); // 3. check result diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 37a69a176c6e1ded81a8449da3c571442bd94e78..4800e1402ba56f2956c207f44f2656a71d50b92c 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -132,16 +132,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(out.non_zero_elements().data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - rulebook, - kernel_tensor, - out, - paddings, - dilations, - strides, - 1, - subm); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); f_verify(grads[0].data(), features_grad); f_verify(grads[1].data(), kernel_grad); } @@ -231,16 +232,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(h_features_tensor.data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_rulebook, - d_kernel_tensor, - d_out, - paddings, - dilations, - strides, - 1, - subm); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_kernel_tensor, + d_out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); DenseTensor h_features_grad = phi::Empty( dev_ctx_cpu, DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout())); diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..27673704168c9eace0958db770a2309d10da648c --- /dev/null +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -0,0 +1,391 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace tests { + +template +std::vector cast(const std::vector& in) { + std::vector out(in.size()); + for (uint64_t i = 0; i < in.size(); i++) { + out[i] = static_cast(in[i]); + } + return out; +} +template +void TestMaxPoolBase(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}) { + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.Init(); + + const int in_channels = x_dims[4]; + const int out_channels = in_channels; + + DenseTensor indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + memcpy( + indices_tensor.data(), indices.data(), indices.size() * sizeof(int)); + DenseTensor features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + memcpy( + features_tensor.data(), features.data(), features.size() * sizeof(T)); + + SparseCooTensor x_tensor(indices_tensor, features_tensor, x_dims); + + auto f_verify = [&](const T* real_data, const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + + if (!std::is_same::value) { + DenseTensor rulebook = phi::Empty( + dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + SparseCooTensor out = sparse::MaxPool(dev_ctx_cpu, + x_tensor, + kernel_sizes, + paddings, + dilations, + strides, + &rulebook); + + ASSERT_EQ(correct_out_dims.size(), out.dims().size()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], out.dims()[i]); + } + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz()); + + int cmp_indices = memcmp(correct_out_indices.data(), + out.non_zero_indices().data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices, 0); + + f_verify(out.non_zero_elements().data(), correct_out_features); + + if (backward) { + DenseTensor x_grad = sparse::MaxPoolGrad(dev_ctx_cpu, + x_tensor, + rulebook, + out, + out.non_zero_elements(), + kernel_sizes); + f_verify(x_grad.data(), features_grad); + } + } + +// test gpu +#if defined(PADDLE_WITH_CUDA) + phi::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + + DenseTensor d_indices_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + phi::Copy( + dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); + + DenseTensor d_features_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + phi::Copy( + dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor); + + SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); + + DenseTensor d_rulebook = phi::Empty( + dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + SparseCooTensor d_out = sparse::MaxPool(dev_ctx_gpu, + d_x_tensor, + kernel_sizes, + paddings, + dilations, + strides, + &d_rulebook); + + ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]); + } + + DenseTensor h_indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); + phi::Copy(dev_ctx_gpu, + d_out.non_zero_indices(), + phi::CPUPlace(), + true, + &h_indices_tensor); + + int cmp_indices2 = memcmp(correct_out_indices.data(), + h_indices_tensor.data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices2, 0); + + DenseTensor h_features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {d_out.nnz()}, + d_out.layout())); + + phi::Copy(dev_ctx_gpu, + d_out.non_zero_elements(), + phi::CPUPlace(), + true, + &h_features_tensor); + f_verify(h_features_tensor.data(), correct_out_features); + + if (backward) { + DenseTensor x_grad = sparse::MaxPoolGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_out, + d_out.non_zero_elements(), + kernel_sizes); + DenseTensor h_features_grad = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(x_grad.dtype(), x_grad.dims(), x_grad.layout())); + phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad); + f_verify(h_features_grad.data(), features_grad); + } +#endif +} + +void TestMaxPool(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}) { + // test float + TestMaxPoolBase(indices, + features, + x_dims, + correct_out_indices, + correct_out_features, + correct_out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + diff, + backward, + features_grad); + // test double + TestMaxPoolBase(indices, + cast(features), + x_dims, + correct_out_indices, + cast(correct_out_features), + correct_out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + diff, + backward, + cast(features_grad)); +} + +TEST(DEV_API, sparse_maxpool) { + const int channels = 1; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 2, 2, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 2, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 3, 3}; + std::vector x_grad = {0, 4, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool_stride) { + const int channels = 1; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 1, 1, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {2, 2, 2}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 2, 3}; + std::vector out_indices = {0, 0, 0, 0}; + std::vector out_features = {2}; + std::vector x_grad = {0, 2, 0}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool_channel) { + const int channels = 2; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 2, 2, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 1, 2, 2, 3, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 2, 2, 3, 3, 3, 3}; + std::vector x_grad = {0, 0, 4, 4, 6, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool3d) { + const int channels = 2; + DDim x_dims = {1, 5, 4, 4, channels}; + DDim out_dims = {1, 3, 2, 2, channels}; + std::vector kernel_sizes = {3, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 1, 2, 2, 3, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 2, 2, 3, 3, 3, 3}; + std::vector x_grad = {0, 0, 4, 4, 6, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index dfec291bc072f023dd09dba768cdeeb6e4cc3a34..82fa90c1574bd5c358d9e2325349811d43f5d973 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 06048f33d940a28ddf9e3aa488a6e24a9e4a93b6..8468dad10eb64a066cc11dafa125dde3174b7e30 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -72,6 +72,11 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return selected_rows_inputs.count(name) > 0; } + // add member if needed + bool IsDenseTensorVectorInput(const std::string& name) const override { + return false; + } + bool IsDenseTensorOutput(const std::string& name) const override { return dense_tensor_outputs.count(name) > 0; } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 76b45ff89f1869cc5e401b5c1b4151ad14158259..1b259023f94df7279066533bb6c182a644b4e9c2 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -44,6 +44,9 @@ function update_pd_ops() { cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py + # generate test model + cd ${PADDLE_ROOT} + python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs } function init() { @@ -93,7 +96,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? if [ "$build_error" != 0 ];then exit 7; fi diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 75afa4ef43ff602d0fe4b9a6ce7c7c6ad5aab8a0..78a863040ade1a43e9de660bff59f5179535abef 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -55,7 +55,6 @@ wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" -if not defined BRANCH set BRANCH=develop if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto @@ -70,7 +69,6 @@ if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON if not defined WITH_TPCACHE set WITH_TPCACHE=OFF -if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_SCCACHE set WITH_SCCACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF @@ -145,17 +143,6 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt rmdir %BUILD_DIR% /s/q - - : clear third party cache every once in a while - if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party /s/q - ) goto :mkbuild ) @@ -212,6 +199,7 @@ echo There is not sccache in this PC, will install sccache. echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" xcopy sccache.exe %PYTHON_ROOT%\ /Y +del sccache.exe goto:eof rem -------Caching strategy 2: End -------------------------------- @@ -232,13 +220,12 @@ set WITH_AVX=ON set MSVC_STATIC_CRT=OFF set ON_INFER=OFF set WITH_TENSORRT=ON +set WITH_INFERENCE_API_TEST=OFF call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for OPENBLAS/CPU------ @@ -254,8 +241,6 @@ call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for unittests and inference in CUDA11-MKL-AVX---------- @@ -265,7 +250,6 @@ set WITH_GPU=ON set WITH_AVX=ON set MSVC_STATIC_CRT=ON set ON_INFER=ON -set WITH_TESTING=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=ON @@ -274,7 +258,8 @@ call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error ::call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error +::call :test_inference_ut || goto test_inference_ut_error +call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------Build windows avx whl package------ @@ -365,18 +350,6 @@ if "%WITH_GPU%"=="ON" ( nvidia-smi 2>NUL ) -rem ------pre install clcache and init config---------- -rem pip install clcache --user -pip uninstall -y clcache -:: set USE_CLCACHE to enable clcache -rem set USE_CLCACHE=1 -:: In some scenarios, CLCACHE_HARDLINK can save one file copy. -rem set CLCACHE_HARDLINK=1 -:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported -rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 -:: set maximum cache size to 20G -rem clcache.exe -M 21474836480 - rem ------set third_party cache dir------ if "%WITH_TPCACHE%"=="OFF" ( @@ -384,6 +357,25 @@ if "%WITH_TPCACHE%"=="OFF" ( goto :cmake_impl ) +rem clear third party cache every ten days +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# +set day_now=%datetime:~6,2% +set day_before=-1 +set /p day_before=< %cache_dir%\day_third_party.txt +if %day_now% NEQ %day_before% ( + echo %day_now% > %cache_dir%\day_third_party.txt + type %cache_dir%\day_third_party.txt + if %day_now% EQU 21 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 11 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 01 ( + rmdir %cache_dir%\third_party /s/q + ) +) + echo set -ex > cache.sh echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake ^|md5sum ^| awk '{print $1}') >> cache.sh echo echo ${md5_content}^>md5.txt >> cache.sh @@ -535,11 +527,7 @@ echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( ninja all ) else ( - if "%WITH_CLCACHE%"=="OFF" ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj - ) else ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj - ) + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) if %ERRORLEVEL% NEQ 0 ( @@ -774,77 +762,8 @@ echo ======================================== echo Step 6. Check whether deleting a unit test ... echo ======================================== -cd /d %work_dir%\%BUILD_DIR% -echo set -e> check_change_of_unittest.sh -echo set +x>> check_change_of_unittest.sh -echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh -echo GIT_PR_ID=%AGILE_PULL_ID% >> check_change_of_unittest.sh -echo BRANCH=%BRANCH%>> check_change_of_unittest.sh -echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh -echo exit 0 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo set -x>> check_change_of_unittest.sh -echo cat ^<^> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo Generate unit tests.spec of this PR. >> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo EOF>> check_change_of_unittest.sh -echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh -echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh -echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh -echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh -echo echo ipipe_log_param_Windows_1_Card_TestCases_Count: $num>> check_change_of_unittest.sh -echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh -echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh -echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh -echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh -echo elif [ "$origin_upstream_url" ^!= "$UPSTREAM_URL" ] ^\>> check_change_of_unittest.sh -echo ^&^& [ "$origin_upstream_url" ^!= "$UPSTREAM_URL.git" ]; then>> check_change_of_unittest.sh -echo git remote remove upstream>> check_change_of_unittest.sh -echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo if [ ! -e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>> check_change_of_unittest.sh -echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo git checkout -b origin_pr >> check_change_of_unittest.sh -echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ --DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ --DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ --DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> check_change_of_unittest.sh -echo cat ^<^> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo Generate unit tests.spec of develop. >> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo EOF>> check_change_of_unittest.sh -echo spec_path=$(pwd)/UNITTEST_DEV.spec>> check_change_of_unittest.sh -echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh -echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>> check_change_of_unittest.sh -echo if [ "$unittest_spec_diff" ^!= "" ]; then>> check_change_of_unittest.sh -echo set +x>> check_change_of_unittest.sh -echo approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>> check_change_of_unittest.sh -echo set -x>> check_change_of_unittest.sh -echo if [ "$approval_line" ^!= "" ]; then>> check_change_of_unittest.sh -echo APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>> check_change_of_unittest.sh -echo echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">> check_change_of_unittest.sh -echo if [ "${APPROVALS}" == "FALSE" ]; then>> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo echo -e "It is forbidden to disable or delete the unit-test.\n" >> check_change_of_unittest.sh -echo echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]." >> check_change_of_unittest.sh -echo echo -e "Then you must have one RD (kolinwei(recommended) or zhouwei25) approval for the deletion of unit-test. \n" >> check_change_of_unittest.sh -echo echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. \n" >> check_change_of_unittest.sh -echo echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n" >> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo else>> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo git checkout -f origin_pr >> check_change_of_unittest.sh -%cache_dir%\tools\busybox64.exe bash check_change_of_unittest.sh +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\check_change_of_unittest.sh + goto:eof :check_change_of_unittest_error diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index ae2d9163435b906f17e9b28a680302d2bd305bbc..e303ce1216822b26bb58813c37239ae3e3fec043 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -21,11 +21,12 @@ from paddle.fluid import framework from .utils import print_program_with_dist_attr from .operators import find_best_compatible_distributed_operator_impl -from .dist_context import get_default_distributed_context +from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute +from .process_mesh import ProcessMesh from paddle.distributed.fleet.meta_optimizers.common import OpRole @@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list): return compatible_result +def merge_process_mesh_two(pm1, pm2): + process_set1 = set() + process_set2 = set() + if pm1 is None and pm2 is None: + return None + if pm1 is not None: + process_set1 = set(pm1.processes) + if pm2 is not None: + process_set2 = set(pm2.processes) + merged_process_set = process_set1.union(process_set2) + merged_process_mesh = ProcessMesh(list(merged_process_set)) + return merged_process_mesh + + class Completer: def __init__(self, dist_context): assert dist_context is not None @@ -119,7 +134,9 @@ class Completer: return False tensor_desc = tensor_node.var() # Skip reader tensor - if tensor_desc.type() == core.VarDesc.VarType.READER: + if tensor_desc.type() == core.VarDesc.VarType.READER \ + or tensor_desc.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor_desc.type == core.VarDesc.VarType.STEP_SCOPES: return False tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( tensor_node) @@ -185,7 +202,7 @@ class Completer: op_dist_attr = dist_op.dist_attr if fwd: for tensor_node in op_node.inputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -208,19 +225,19 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( dist_op, fwd=True) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx else: for tensor_node in op_node.outputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -243,61 +260,38 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( dist_op, fwd=False) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx return changed - def _update_process_mesh(self): - def _find_nearset_node(nodes, idx): - for node in reversed(nodes[:idx]): - node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - if node_dist_attr.process_mesh is not None: - return node - - total_reach_fix_point = False - while not total_reach_fix_point: - total_changed = False - for is_fwd in [True, False]: - all_nodes = self._dist_context.serial_ordered_nodes \ - if is_fwd else reversed(self._dist_context.serial_ordered_nodes) - reach_fix_point = False - while not reach_fix_point: - changed = False - for idx, node in enumerate(all_nodes): - nearest_node = _find_nearset_node( - self._dist_context.serial_ordered_nodes, idx) - if nearest_node is None: - continue - nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph( - nearest_node) - nearest_process_mesh = nearest_node_dis_attr.process_mesh - cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - cur_process_mesh = cur_node_dist_attr.process_mesh - compatible_process_mesh = compute_compatible_process_mesh( - [cur_process_mesh, nearest_process_mesh]) - if compatible_process_mesh is not None \ - and cur_process_mesh != compatible_process_mesh: - cur_node_dist_attr.process_mesh = compatible_process_mesh - changed = True - if changed: - reach_fix_point = False - total_changed = True - else: - reach_fix_point = True - if total_changed: - total_reach_fix_point = False - else: - total_reach_fix_point = True + def _update_dims_mapping_between_graphs(self): + changed = False + for parent_node, child_node in self._node_pairs_between_graphs: + parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + parent_node) + child_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + child_node) + parent_node_dims_mapping = parent_node_dist_attr.dims_mapping + child_node_dims_mapping = child_node_dist_attr.dims_mapping + compatible_dims_mapping = compute_compatible_dims_mapping( + [parent_node_dims_mapping, child_node_dims_mapping]) + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != parent_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != child_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + return changed def _update_dims_mapping(self): # Complete dims_mapping for each node @@ -318,11 +312,314 @@ class Completer: node, fwd=is_fwd) if op_changed: changed = True + graph_changed = self._update_dims_mapping_between_graphs() + if graph_changed: + changed = True if changed: reach_fix_point = False else: reach_fix_point = True + def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + # Set the process mesh of the op node by its nearest op node + if not op_dist_attr.is_annotated("process_mesh"): + process_mesh = op_dist_attr.process_mesh + nearest_op_dis_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + nearest_process_mesh = nearest_op_dis_attr.process_mesh + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh, nearest_process_mesh]) + if compatible_process_mesh is not None \ + and process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + # Skip the process_mesh setting of inputs and outputs of while_op + if op_dist_attr.op_type == "while": + return + # Set the process mesh of the op node's leaf-inputs + for tensor_node in op_node.inputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + # Skip the non-leaf var node + if len(tensor_node.inputs) != 0: + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + # Set the process mesh of the op node's outputs + for tensor_node in op_node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + + def _update_process_mesh_for_specials(self): + def _find_nearest_tensor_node_before(nodes, idx, var_name): + for node in reversed(nodes[:idx]): + if node.is_var() and node.var() is not None \ + and node.var().name() == var_name: + return node + + def _find_nearest_tensor_node_after(nodes, idx, var_name): + for node in nodes[idx + 1:]: + if node.is_var() and node.var() is not None \ + and node.var().name() == var_name: + return node + + def _find_nodes_related_to_cond(source_node): + related_nodes = [] + visited = set() + frontier = list() + frontier.append(source_node) + # BFS + while len(frontier) != 0: + cur = frontier[0] + frontier = frontier[1:] + if _node_id(cur) in visited: + continue + # TODO: need more restrictions + for node in cur.inputs: + if node.is_var() and node.var() is not None: + if node.var().type() != core.VarDesc.VarType.READER \ + and len(node.var().shape()) == 1: + frontier.append(node) + related_nodes.append(node) + if node.is_op() and node.op() is not None: + flag = True + if node.op().type() == "create_py_reader" \ + or node.op().type() == "create_double_buffer_reader" \ + or node.op().type() == "read": + flag = False + for tensor_node in node.inputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + for tensor_node in node.outputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + if flag: + frontier.append(node) + related_nodes.append(node) + visited.add(_node_id(cur)) + return related_nodes + + # Amend the process meshes related to while_op + for while_op_node, while_op_node_idx in self._while_op_nodes.values(): + sub_graph_id = while_op_node.op()._block_attr_id("sub_block") + sub_graph = self._dist_context._serial_graph.get_sub_graph( + sub_graph_id) + sub_graph_nodes = list(sub_graph.all_nodes()) + while_dist_op = self._dist_context.get_dist_op_for_graph( + while_op_node) + while_op_dist_attr = while_dist_op.dist_attr + + # Step 1: set the process mesh of while_op to the merged process mesh of its subblock + merged_process_mesh = while_op_dist_attr.process_mesh + for node in sub_graph_nodes: + if (node.is_var() and node.var() is not None) \ + or (node.is_op() and node.op() is not None): + dist_attr = self._dist_context.get_dist_attr_for_graph(node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + while_op_dist_attr.process_mesh = merged_process_mesh + + # Step 2: set the related nodes of while_op to the process mesh of while_op + # Step 2.1: Find related nodes of cond var the graph of while_op + cond_tensor_related_nodes = [] + cond_tensor_name = while_op_node.op().input("Condition")[0] + cond_tensor_node = None + for node in while_op_node.inputs: + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name: + cond_tensor_node = node + cond_tensor_related_nodes.append(cond_tensor_node) + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + + # Step 2.2: Find related nodes of cond var in the subgraph of while_op + cond_tensor_node = None + for node in reversed(sub_graph_nodes): + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name \ + and len(node.outputs) == 0: + cond_tensor_node = node + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + # Step 2.3: Add the StepScops output of while_op + stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0] + stepscopes_tensor_node = None + for output_node in while_op_node.outputs: + if output_node.is_var() and output_node.var() is not None \ + and output_node.var().name() == stepscopes_tensor_name: + stepscopes_tensor_node = output_node + cond_tensor_related_nodes.append(stepscopes_tensor_node) + # Step 2.4: Set the process meshes of all nodes related to cond var to the process mesh of while op + for node in cond_tensor_related_nodes: + tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + node) + tensor_dist_attr.process_mesh = merged_process_mesh + + # Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes + while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_inputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Step 4: set the process meshes of the outputs in while_op to the process meshes of the outside output nodes + while_op_outputs_dist_attrs = while_op_dist_attr.outputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_outputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + if nearest_tensor_node is None: + nearest_tensor_node = _find_nearest_tensor_node_after( + self._dist_context.serial_ordered_nodes, + while_op_node_idx, tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Amend the process meshes related to array + for array_node_list in self._array_nodes.values(): + merged_process_mesh = None + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + dist_attr.process_mesh = merged_process_mesh + + def _update_process_mesh(self): + ordered_op_nodes = self._dist_context._serial_ordered_op_nodes + + # Step 1: Set the annotated process meshes from tensors to the first ops using them + ordered_tensor_nodes = self._dist_context._serial_ordered_tensor_nodes + for tensor_node in ordered_tensor_nodes: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if not tensor_dist_attr.is_annotated("process_mesh"): + continue + first_op_node = None + for op_node in ordered_op_nodes: + # TODO: Need a better rule for the control flow ops. + # For now, do not set the process mesh of while_op from its inputs + if op_node.op().type() == "while": + continue + for input_tensor_node in op_node.inputs: + if _node_id(tensor_node) == _node_id(input_tensor_node): + first_op_node = op_node + break + if first_op_node is not None: + break + if first_op_node is None: + continue + op_dist_attr = self._dist_context.get_dist_attr_for_graph( + first_op_node) + if op_dist_attr is not None and not op_dist_attr.is_annotated( + "process_mesh"): + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and op_dist_attr.process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + + # Step 2: set the process meshes of ops with the nearest op before them + # Step 2.1: find the first op node which has the process mesh + idx_of_first_op_node_has_process_mesh = -1 + for idx, op_node in enumerate(ordered_op_nodes): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + if op_dist_attr.process_mesh is not None \ + and idx_of_first_op_node_has_process_mesh == -1: + idx_of_first_op_node_has_process_mesh = idx + # Reuse the following method to set the related tensors for same op node + self._update_process_mesh_by_nearest(op_node, op_node) + # Step 2.2: set the process meshes of ops by the nearest op node after the first op node + if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes): + return None + for idx, op_node in enumerate(ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh + 1:]): + original_idx = idx_of_first_op_node_has_process_mesh + +idx + 1 + nearest_op_node = ordered_op_nodes[original_idx - 1] + nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + assert nearest_op_dist_attr.process_mesh is not None + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + # Step 2.3: set the process meshes of ops by the nearest op node before the first op node + nearest_op_node = ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh] + for op_node in ordered_op_nodes[:idx_of_first_op_node_has_process_mesh]: + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + + # Step 3: adjust the process meshes for special ops + self._update_process_mesh_for_specials() + + def _prepare(self): + self._while_op_nodes = {} + self._array_nodes = {} + self._node_pairs_between_graphs = [] + all_nodes = self._dist_context.serial_ordered_nodes + for idx, node in enumerate(all_nodes): + if node.is_op(): + if node.op().type() == "while": + self._while_op_nodes[_node_id(node)] = (node, idx) + if node.op().type() == "read_from_array": + array_var_name = node.op().input("X")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + if node.op().type() == "write_to_array": + array_var_name = node.op().output("Out")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + self._array_nodes[array_var_name].append(node.outputs[0]) + if node.is_var() and node.var() is not None: + if node.node.graph_id() != 0: + for before_node in reversed(all_nodes[:idx]): + if before_node.is_var() and before_node.var() is not None \ + and before_node.node.graph_id() == node.node.graph_id() - 1 \ + and before_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (before_node, node)) + for after_node in all_nodes[idx + 1:]: + if after_node.is_var() and after_node.var() is not None \ + and after_node.node.graph_id() == node.node.graph_id() - 1 \ + and after_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (after_node, node)) + def complete_forward_annotation(self, serial_main_program): """ Complete annotation for the partial annotated serial_main_program. Arguments: @@ -336,24 +633,24 @@ class Completer: # Initialize distributed attributes for all var and op node in serial_main_program self._dist_context.init_dist_attr_for_program() + # print_program_with_dist_attr(serial_main_program, self._dist_context) # Initialize distributed attributes for all var and op node in graph self._dist_context.init_dist_attr_for_graph() + self._prepare() + self._update_process_mesh() - # Complete dims_mapping for each node self._update_dims_mapping() # Copy the corresponding distributed attribute from graph to serial_main_program self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.clear_dist_info_for_graph() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) # Do the validation check and amend some completion self._dist_context.amend_dist_attr_for_program() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) self._dist_context.validate_dist_attr_for_program() return serial_main_program diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index b27cd7a37c95626584194ae7bd619ab16a0e5ea7..8ec702ffcb0b65af96833b4d4d2be1c8ff08d788 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -175,6 +175,7 @@ class TensorDistributedAttribute: class OperatorDistributedAttribute: def __init__(self): self._process_mesh = None + self._op_type = None self._impl_type = None self._impl_idx = None self._inputs_dist_attrs = {} @@ -194,11 +195,23 @@ class OperatorDistributedAttribute: if isinstance(process_mesh, list): process_mesh = ProcessMesh(process_mesh) self._process_mesh = copy.deepcopy(process_mesh) + # In while op, the proess mesh is not shared by all inputs and outputs + if self._op_type == "while": + return None for dist_attr in self._inputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh for dist_attr in self._outputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh + @property + def op_type(self): + return self._op_type + + @op_type.setter + def op_type(self, op_type): + if op_type is not None: + self._op_type = op_type + @property def impl_type(self): return self._impl_type @@ -326,6 +339,8 @@ class OperatorDistributedAttribute: assert False, "No setter for {} in args {}.".format( key, dist_attr) # Make sure proscess_meshes in dist op be same + if self.op_type == "while": + return None process_meshes = [] process_meshes.append(self.process_mesh) for tensor_dist_attr in self.inputs_dist_attrs.values(): diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 573f23fdca519ae1da10d62ef7eb2da6238805f3..2807c46540ab1e52f7490c850faa34eac00c04db 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -15,6 +15,7 @@ import copy from collections import defaultdict from paddle.fluid import framework +from paddle.fluid.framework import get_flags, set_flags from paddle.fluid import core from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute @@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context): _g_default_distributed_context = dist_context +def _node_id(node): + return (node.node.graph_id(), node.node.id()) + + class DistributedContext: """ DistributedContext is used to collect related distributed information for program and graph. @@ -146,7 +151,7 @@ class DistributedContext: return None def get_dist_tensor_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) return self._dist_tensors_for_graph.get(serial_tensor_node_id, None) def get_dist_op_for_program(self, serial_op): @@ -168,7 +173,7 @@ class DistributedContext: del self._dist_ops_for_program[serial_tensor_id] def get_dist_op_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) return self._dist_ops_for_graph.get(serial_op_node_id, None) def get_tensor_dist_attr_for_program(self, serial_tensor): @@ -197,7 +202,7 @@ class DistributedContext: self.add_dist_tensor_for_program(dist_tensor) def get_tensor_dist_attr_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id, None) if dist_tensor: @@ -242,7 +247,7 @@ class DistributedContext: self.add_dist_op_for_program(dist_op) def get_op_dist_attr_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -262,7 +267,7 @@ class DistributedContext: def get_dist_attr_for_graph(self, serial_node): if serial_node.is_var() and serial_node.var() is not None: - serial_tensor_node_id = serial_node.id() + serial_tensor_node_id = _node_id(serial_node) dist_tensor = self._dist_tensors_for_graph.get( serial_tensor_node_id, None) if dist_tensor: @@ -270,7 +275,7 @@ class DistributedContext: else: return None if serial_node.is_op() and serial_node.op() is not None: - serial_op_node_id = serial_node.id() + serial_op_node_id = _node_id(serial_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -311,40 +316,69 @@ class DistributedContext: def order_nodes_by_program_order(self): def _contains(nodes, target_node): for node in nodes: - if node.id() == target_node.id(): + if _node_id(node) == _node_id(target_node): return True return False - ordered_tensor_nodes = [] - ordered_op_nodes = [] - all_nodes = self._serial_graph.all_nodes() + serial_ordered_tensor_nodes = [] + serial_ordered_op_nodes = [] + all_nodes = [] + # for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for node in graph.all_nodes(): + all_nodes.append(node) for node in all_nodes: if node.is_var() and node.var() is not None: - ordered_tensor_nodes.append(node) + serial_ordered_tensor_nodes.append(node) if node.is_op() and node.op() is not None: - ordered_op_nodes.append(node) - ordered_tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) - ordered_op_nodes.sort(key=lambda node: node.node.original_desc_id()) - for op_node in ordered_op_nodes: + serial_ordered_op_nodes.append(node) + serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + num_nodes_before = len(serial_ordered_tensor_nodes) + len( + serial_ordered_op_nodes) + + new_serial_ordered_tensor_nodes = [] + new_serial_ordered_op_nodes = [] + for op_node in serial_ordered_op_nodes: tensor_nodes = [] for tensor_node in op_node.inputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.append(op_node) + new_serial_ordered_op_nodes.append(op_node) tensor_nodes = [] for tensor_node in op_node.outputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) + tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) - num_nodes_before = len(ordered_tensor_nodes) + len(ordered_op_nodes) - assert len(self._serial_ordered_nodes) == num_nodes_before, \ - "The number of nodes before ordering is not the same after ordering." + new_serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + new_serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes + self._serial_ordered_op_nodes = new_serial_ordered_op_nodes + assert len(self._serial_ordered_nodes) == len( + self._serial_ordered_tensor_nodes) + len( + self._serial_ordered_op_nodes) + self._serial_orphan_tensor_nodes = [] + for tensor_node in serial_ordered_tensor_nodes: + if not _contains(self._serial_ordered_tensor_nodes, tensor_node): + self._serial_orphan_tensor_nodes.append(tensor_node) + if len(self._serial_ordered_nodes) != num_nodes_before: + print( + "WARNING: there are some orphan tensors or ops which are not used in the execution." + ) def init_dist_attr_for_graph(self): assert self._is_initialized_for_program, \ @@ -352,9 +386,9 @@ class DistributedContext: if self._is_initialized_for_graph: return # Convert program to graph + set_flags({"FLAGS_convert_all_blocks": True}) self._serial_graph = framework.IrGraph( core.Graph(self._serial_program.desc)) - all_nodes = self._serial_graph.all_nodes() self.order_nodes_by_program_order() for node in self.serial_ordered_nodes: if node.is_var() and node.var() is not None: @@ -365,10 +399,11 @@ class DistributedContext: if tensor_id == cur_tensor_id \ or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): dist_tensor = cur_dist_tensor - self._node_id_to_tensor_id[node.id()] = cur_tensor_id + self._node_id_to_tensor_id[_node_id( + node)] = cur_tensor_id assert dist_tensor is not None, \ "Tensor must have a distributed tensor after the initialization for program." - serial_tensor_node_id = node.id() + serial_tensor_node_id = _node_id(node) new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, dist_tensor.dist_attr) self._dist_tensors_for_graph[ @@ -381,10 +416,10 @@ class DistributedContext: if op_id == cur_op_id \ or op_id == cur_dist_op.serial_op.desc.original_id(): dist_op = cur_dist_op - self._node_id_to_op_id[node.id()] = cur_op_id + self._node_id_to_op_id[_node_id(node)] = cur_op_id assert dist_op is not None, \ "Operator must have a distributed operator after the initialization for program." - serial_op_node_id = node.id() + serial_op_node_id = _node_id(node) new_dist_op = DistributedOperator(dist_op.serial_op, dist_op.dist_attr) self._dist_ops_for_graph[serial_op_node_id] = new_dist_op @@ -402,10 +437,11 @@ class DistributedContext: assert self._is_initialized_for_program and self._is_initialized_for_graph, \ "Both program and graph must be initialized." updated_tensors = {} - all_nodes = self._serial_graph.all_nodes() + # all_nodes = self._serial_graph.all_nodes() + all_nodes = self._serial_ordered_nodes for node in all_nodes: if node.is_var() and node.var() is not None: - tensor_id = self._node_id_to_tensor_id[node.id()] + tensor_id = self._node_id_to_tensor_id[_node_id(node)] updated = updated_tensors.get(tensor_id, False) # If a var has multiples var nodes in graph, only use the first one for now if not updated: @@ -416,16 +452,31 @@ class DistributedContext: dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph updated_tensors[tensor_id] = True if node.is_op() and node.op() is not None: - op_id = self._node_id_to_op_id[node.id()] + op_id = self._node_id_to_op_id[_node_id(node)] op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph + # TODO: the completion algorithm will skip orphan tensors, + # here we just set there process_mesh to the first one. + for orphan_node in self._serial_orphan_tensor_nodes: + serial_tensor_id = orphan_node.var().id() + dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, + None) + if dist_tensor: + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] + else: + serial_tensor_id = orphan_node.var().original_id() + dist_tensor = self._dist_tensors_for_program.get( + serial_tensor_id, None) + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] def amend_dist_attr_for_program(self): for dist_tensor in self._dist_tensors_for_program.values(): serial_tensor = dist_tensor.serial_tensor dist_attr = dist_tensor.dist_attr - if serial_tensor.type == core.VarDesc.VarType.READER: + if serial_tensor.type == core.VarDesc.VarType.READER \ + or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = serial_tensor.shape @@ -446,6 +497,7 @@ class DistributedContext: tensor_shape = [] else: if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ or dist_op.serial_op.type == "create_py_reader": tensor_shape = [] else: @@ -459,8 +511,9 @@ class DistributedContext: and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: - if dist_op.get_serial_output( - arg_name).type == core.VarDesc.VarType.READER: + if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = dist_op.get_serial_output(arg_name).shape @@ -498,7 +551,8 @@ class DistributedContext: for k, v in self.__dict__.items(): if k == "_serial_program" or k == "_serial_graph" \ or k == "_dist_main_programs" or k == "_dist_startup_programs" \ - or k == "_serial_ordered_nodes": + or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \ + or k == "_serial_ordered_op_nodes": setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index 67de298564afc8caddad90d228131f1795f5707e..a2c2748a8cea390003dfec857a252b7df3ee1b05 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -76,7 +76,8 @@ class DistributedOperator: if tensor is None: tensor_shape = [] else: - if tensor.type == core.VarDesc.VarType.READER: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: tensor_shape = [] else: tensor_shape = tensor.shape @@ -86,7 +87,9 @@ class DistributedOperator: tensor_dims_mapping) for tensor_name in self._serial_op.output_arg_names: tensor = self._serial_op.block._var_recursive(tensor_name) - if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = tensor.shape @@ -95,6 +98,8 @@ class DistributedOperator: tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] self._dist_attr.set_output_dims_mapping(tensor_name, tensor_dims_mapping) + if self._dist_attr.op_type is None: + self._dist_attr.op_type = self.serial_op.type if self._dist_attr.impl_type is None: self._dist_attr.impl_type = "default" if self._dist_attr.impl_idx is None: @@ -134,12 +139,16 @@ class DistributedOperator: return new_dist_attr def validate_dist_attr(self): - if "read" in self.serial_op.type: + if "read" in self.serial_op.type or "while" == self.serial_op.type: return True for name in self.serial_op.input_arg_names: input_dist_attr = self.dist_attr.get_input_dist_attr(name) dims_mapping = input_dist_attr.dims_mapping - shape = self.get_serial_input(name).shape + if self.get_serial_input( + name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + shape = [] + else: + shape = self.get_serial_input(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -155,7 +164,11 @@ class DistributedOperator: for name in self.serial_op.output_arg_names: output_dist_attr = self.dist_attr.get_output_dist_attr(name) dims_mapping = output_dist_attr.dims_mapping - shape = self.get_serial_output(name).shape + if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\ + or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES: + shape = [] + else: + shape = self.get_serial_output(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -241,14 +254,14 @@ class DistributedModule: def __call__(self, *args, **kwargs): from .dist_context import get_default_distributed_context - main_prog = paddle.fluid.default_main_program() - main_block = main_prog.global_block() - op_size = len(main_block.ops) + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + op_size = len(cur_block.ops) output = self._serial_module(*args, **kwargs) - new_op_size = len(main_block.ops) + new_op_size = len(cur_block.ops) default_dist_ctx = get_default_distributed_context() for idx in range(op_size, new_op_size): - op = main_block.ops[idx] + op = cur_block.ops[idx] dist_op = DistributedOperator(op, self._dist_attr) dist_op.dist_attr.mark_annotated_as(self._dist_attr) default_dist_ctx.add_dist_op_for_program(dist_op) diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index 5e3c852699ab6f8dcb92b386989338e5ca3d2c1f..a42ce863492b3511e0e7ddfaa3a04b67f57e1157 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -184,7 +184,9 @@ class DistributedTensor: def _init_default_dist_attr(self): if self._dist_attr.dims_mapping is None: - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = self._serial_tensor.shape @@ -192,7 +194,9 @@ class DistributedTensor: self._dist_attr.dims_mapping = tensor_dims_mapping def validate_dist_attr(self): - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: return True tensor_shape = self.serial_tensor.shape if len(tensor_shape) != len(self.dist_attr.dims_mapping): diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 56beb8957415d3c3c401fdbf754cb17fc5e253a7..6bd1c5527a99e73ddcde1ada5f2a5a496c0d9933 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -259,7 +259,7 @@ class Engine: "train_" + name: val for name, val in logs.items() } - self._logger.info(logs) + self._logger.info(train_logs) def _train_step(self, data): logs = {} diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 4b079e7b6b575a6bcfd372782529ccc2958cf5db..47f76353e465529f1d29a05852a952d151c76c93 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute _g_distributed_operator_impl_containers = {} -_g_elementwise_ops = ["elementwise_add", "gelu", "dropout", "cast"] +_g_elementwise_ops = [ + "elementwise_add", "gelu", "dropout", "cast", "gather", "concat" +] BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 4e977007261a73e9b24a051f84e6e30f2bf9d860..de6d018d60521564ebc98b8df03e4b1356b846c8 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_dist_attr = dist_op.dist_attr for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: if mapping != -1: @@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): xshape_arg_names = op_desc.output("XShape") for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: @@ -104,7 +114,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) # Check output compatibility output_names = op_desc.output_names() @@ -121,7 +132,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: if dims_mapping[0] != -1: return False @@ -129,7 +141,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[2:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[1]) + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) # Check batch dim mapping compatibility if not all(batch_dim_mappings[0] == dim_mapping @@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr # The following statement will be replaced by a more elegent way - if op_desc.type() == "shape" or op_desc.type() == "slice": + if op_desc.type() == "shape" \ + or op_desc.type() == "slice" \ + or op_desc.type() == "while": return False output_names = op_desc.output_names() xshape_arg_names = [] @@ -155,17 +170,22 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: batch_dim_mappings.append(dims_mapping[1]) + if not batch_dim_mappings: + return changed + compatible_dim_mapping = compute_compatible_dim_mapping( batch_dim_mappings) assert compatible_dim_mapping is not None, "There is no compatible dim mapping." @@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True for arg_name in op_desc.output_arg_names(): @@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True else: - if compatible_dim_mapping != dims_mapping[1]: + if len(dims_mapping + ) >= 2 and compatible_dim_mapping != dims_mapping[1]: dims_mapping[1] = compatible_dim_mapping changed = True diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 058ae1d0a9fd5c25ec83ea15ed9c2e479322957c..c92142cf7384d2b0c76c1a5cb3b4e6ac257303a2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl): if is_valid_list_index(y_dims_mapping, -2) and is_dim_shard(y_dims_mapping[-2]): return False - return True def is_output_compatible(self, dist_op): diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..140336566a146776f805f7b546fe6bb39c267861 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py @@ -0,0 +1,214 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class MetricRecord(object): + """ + One record for a single metric at a given execution step. + """ + + def __init__(self, value, step): + self._value = value + self._step = step + + @property + def value(self): + return self._value + + @value.setter + def value(self, value): + self._value = value + + @property + def step(self): + return self._step + + @step.setter + def step(self, step): + self._step = step + + def mean(self): + return np.mean(self.value) + + def get_state(self): + return {"value": self.value, "step": self.step} + + @classmethod + def from_state(cls, state): + return cls(**state) + + def __eq__(self, other): + if not isinstance(other, MetricRecord): + return False + return other.value == self.value and other.step == self.step + + def __repr__(self): + return "MetricRecord(value={}, step={})".format(self.value, self.step) + + +class MetricRecords(object): + """ + Records of a single metric across different executions. + """ + + def __init__(self, direction="min"): + if direction not in {"min", "max"}: + raise ValueError( + "direction should be one of {min, max}, but got: {}.".format( + direction)) + self._direction = direction + self._records = {} + + @property + def records(self): + return sorted(self._records.values(), key=lambda r: r.step) + + @records.setter + def records(self, records): + for r in records: + self.update(r.value, step=r.step) + + @property + def direction(self): + return self._direction + + @direction.setter + def direction(self, direction): + self._direction = direction + + def update(self, value, step=0): + if step in self._records: + self._records[step].set_value(value) + else: + self._records[step] = MetricRecord(value, step=step) + + def get_best_value(self): + values = list(r.mean() for r in self._records.values()) + if not values: + return None + if self._direction == "min": + return np.nanmin(values) + return np.nanmax(values) + + def get_best_step(self): + best_value = self.get_best_value() + if best_value is None: + return None + for r in self._records.values(): + if r.mean() == best_value: + return r.step + + def get_statistics(self): + records = self.records + records_values = [r.mean() for r in records] + if not len(records_values): + return {} + return { + "min": float(np.nanmin(records_values)), + "max": float(np.nanmax(records_values)), + "mean": float(np.nanmean(records_values)), + "median": float(np.nanmedian(records_values)), + "var": float(np.nanvar(records_values)), + "std": float(np.nanstd(records_values)), + } + + def get_state(self): + state = {} + state["direction"] = self._direction + state["records"] = [r.get_state() for r in self.records] + return state + + @classmethod + def from_state(cls, state): + records = cls(state["direction"]) + records.records = [MetricRecord.from_state(r) for r in state["records"]] + print("here 1", records.records) + return records + + +class MetricsRecorder(object): + """ + Record the values for all metrics. + """ + + def __init__(self, metrics=None): + self._records = {} + self.register_metrics(metrics) + + @property + def records(self): + return self._records + + def exists(self, name): + return name in self._records + + def register_metrics(self, metrics=None): + metrics = metrics or [] + for metric in metrics: + self.register(metric.name) + + def register(self, name, direction=None): + if self.exists(name): + raise ValueError("Metric {} have been registered.".format(name)) + if direction is None: + direction = "min" + self._records[name] = MetricRecords(direction) + + def update(self, name, value, step=0): + value = float(value) + if not self.exists(name): + self.register(name) + + prev_best = self._records[name].get_best_value() + self._records[name].update(value, step=step) + new_best = self._records[name].get_best_value() + + improved = new_best != prev_best + return improved + + def get_records(self, name): + return self._records[name].records + + def set_records(self, name, records): + if not self.exists(name): + self.register(name) + self._records[name].records = records + + def get_best_value(self, name): + return self._records[name].get_best_value() + + def get_best_step(self, name): + return self._records[name].get_best_step() + + def get_statistics(self, name): + return self._records[name].get_statistics() + + def get_state(self): + return { + "metrics": { + name: metric_records.get_state() + for name, metric_records in self._records.items() + } + } + + @classmethod + def from_state(cls, state): + recorder = cls() + recorder._records = { + name: MetricRecords.from_state(metric_records) + for name, metric_records in state["metrics"].items() + } + return recorder diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py new file mode 100644 index 0000000000000000000000000000000000000000..22a6638c5ca63b953dcac3d62c564acf6087a305 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import random +import time +from enum import Enum + +from .storable import Storable +from .recorder import MetricsRecorder +from .tunable_space import TunableSpace + + +class TrialStatus: + RUNNING = "RUNNING" + COMPLETED = "COMPLETED" + STOPPED = "STOPPED" + INVALID = "INVALID" + + +class Trial(Storable): + def __init__(self, tunable_space, trial_id=None, + status=TrialStatus.RUNNING): + self._id = _generate_trial_id() if trial_id is None else trial_id + self._space = tunable_space + self._recorder = MetricsRecorder() + self._score = None + self._best_step = None + self._status = status + + @property + def id(self): + return self._id + + @property + def space(self): + return self._space + + @property + def recorder(self): + return self._recorder + + @property + def score(self): + return self._score + + @score.setter + def score(self, score): + self._score = score + + @property + def best_step(self): + return self._best_step + + @best_step.setter + def best_step(self, best_step): + self._best_step = best_step + + @property + def status(self): + return self._status + + @status.setter + def status(self, status): + self._status = status + + def summary(self): + print("Tunable space:") + if self.space.values: + for tv, value in self.space.values.items(): + print(tv + ":", value) + + if self.score is not None: + print("Score: {}".format(self.score)) + + def get_state(self): + return { + "id": self.id, + "space": self.space.get_state(), + "recorder": self.recorder.get_state(), + "score": self.score, + "best_step": self.best_step, + "status": self.status, + } + + def set_state(self, state): + self._id = state["id"] + self._space = TunableSpace.from_state(state["space"]) + self._recorder = MetricsRecorder.from_state(state["recorder"]) + self._score = state["score"] + self._best_step = state["best_step"] + self._status = state["status"] + + @classmethod + def from_state(cls, state): + trial = cls(tunable_space=None) + trial.set_state(state) + return trial + + +def _generate_trial_id(): + s = str(time.time()) + str(random.randint(1, int(1e7))) + return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32] diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 241eadcbace22cf36504e2c0ed36566fa94b9e4b..86c274cb45cc323dab60968571837e82619e6987 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context, used_dist_context._dist_op_context = DistributedOperatorContext() _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( rank_id, used_dist_context) - # print("dist_main_program: ", dist_main_program) all_dist_main_program.append(dist_main_program) return all_dist_main_program diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 89b59254e5b9105a55c68f3ef871396de1bd9199..6a30276e02ba238a0f4ee838164a5bf9976f7d84 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -89,7 +89,7 @@ class ShardingClipGrad: global_norm_fp16 = paddle.cast( global_norm_fp16, dtype=paddle.float32) - # global norm of non-distributed FP16 params_and_grads for slice parameter + # global norm of non-distributed FP16 params_and_grads for unslice parameter if len(unslice_params_fp16) == 0: global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) else: @@ -104,21 +104,20 @@ class ShardingClipGrad: [0.], dtype=paddle.float32) global_norm_fp32 = layers.reduce_sum(global_norm_fp32) - # global norm of non-distributed FP32 params_and_grads for slice parameter + # global norm of non-distributed FP32 params_and_grads for unslice parameter global_unslice_fp32 = layers.concat(unslice_params_fp32) if len( unslice_params_fp32) != 0 else paddle.to_tensor( [0.], dtype=paddle.float32) global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32) global_unslice_var = global_unslice_fp16 + global_unslice_fp32 - global_norm_var = global_norm_fp16 + global_norm_fp32 + global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var # add all reduce to get global norm of distributed params_and_grads dev_id = int(self._device.split(":")[1]) with device_guard(dev_id, "gpu"): paddle.distributed.all_reduce(global_norm_var, group=self._group) - global_norm_var += global_unslice_var global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) diff --git a/python/paddle/distributed/models/__init__.py b/python/paddle/distributed/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1663029ef1f844676ce9484f724dc253d625386 --- /dev/null +++ b/python/paddle/distributed/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/__init__.py b/python/paddle/distributed/models/moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1663029ef1f844676ce9484f724dc253d625386 --- /dev/null +++ b/python/paddle/distributed/models/moe/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fd98c64318c60e2e67af320c51b24e39a3132c43 --- /dev/null +++ b/python/paddle/distributed/models/moe/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import in_dygraph_mode + + +def _number_count(gate_idx, upper_range): + """ + calculate the expert count according to the gate index. + Args: + gate_idx (Tensor): Tensor. The input gate index whose data type should be int32 or int64. + upper_range (int): The number of the experts. + Returns: + out (Tensor): The output expert count. + Examples: + .. code-block:: python + # required: distributed + import paddle + + gate_idx = [ + [0, 2], + [0, 2] + ] + upper_range = 6 + gate_idx = paddle.to_tensor(gate_idx, dtype="int32") + number_count = paddle.distributed.utils.number_count(gate_idx, upper_range) + print(number_count) # the result: [2, 0, 2, 0, 0, 0] + """ + if in_dygraph_mode(): + return core.ops.number_count(gate_idx, 'upper_range', upper_range) + else: + op_type = 'number_count' + + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=gate_idx.dtype) + + helper.append_op( + type=op_type, + inputs={'gate_idx': gate_idx}, + outputs={'Out': out}, + attrs={'upper_range': upper_range}) + return out diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7480909a2d88dda51971d0ef66ae6c88a56cd79c..fb9e8d8ece100baa3ed7c65a8dc495aa12c254ff 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -228,3 +228,5 @@ if core.is_compiled_with_npu(): atexit.register(core.clear_executor_cache) # NOTE(Aganlengzi): clean up KernelFactory in advance manually. atexit.register(core.clear_kernel_factory) +# NOTE(wangran16): clean up DeviceManger in advance manually. +atexit.register(core.clear_device_manager) diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 80d2ccb0d5ca6fcb3a802014a860bfb2ff9b3400..9dba5d658dfc9f480c5e668be2c34b2bcb673078 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -173,6 +173,9 @@ if core.is_compiled_with_xpu(): elif core.is_compiled_with_npu(): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'NPU', core.VarDesc.VarType.FP16) +elif core.is_compiled_with_mlu(): + _, _, _sys_unsupported_fp16_list = core.op_supported_infos( + 'MLU', core.VarDesc.VarType.FP16) else: _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'GPU', core.VarDesc.VarType.FP16) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 97b4116826a2a0e809af4a4129a0e953fd607b07..d614630b3db12d47d7b6790e40edefbd0402e384 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -979,8 +979,6 @@ class PostTrainingQuantization(object): if op.type in ( self._quantizable_op_type + self._out_scale_op_list): out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type for var_name in out_var_names: analysis_and_save_info(op, var_name) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index efa000274d01a601d6949c16803020272da918b4..afca617b6dd82b9106dbbb4a28e89090b0ad5278 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -59,6 +59,7 @@ _out_scale_op_list = [ "tanh", "prelu", "swish", + "dropout", "softmax", "batch_norm", "layer_norm", @@ -68,6 +69,8 @@ _out_scale_op_list = [ "transpose2", "concat", "elementwise_mul", + "elementwise_pow", + "elementwise_sub", "scale", "slice", "hard_swish", @@ -81,8 +84,54 @@ _out_scale_op_list = [ "flatten2", "transpose", "pad2d", + "pad3d", "reshape", - "layer_norm", + "split", + "flatten_contiguous_range", + "squeeze", + "squeeze2", + "nearest_interp_v2", + "fill_constant_batch_size_like", + "bilinear_interp", + "bilinear_interp_v2", + "arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "equal", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pow", + "reduce_mean", + "stack", + "top_k_v2", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + "where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "gather", + "shuffle_channel", ] # list op real input and output names, to avoid processing input such as AxisTensor. @@ -119,7 +168,7 @@ _op_real_in_out_name = { "relu": [["X"], ["Out"]], "relu6": [["X"], ["Out"]], "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], + "prelu": [["X", "Alpha"], ["Out"]], "tanh": [["X"], ["Out"]], "swish": [["X"], ["Out"]], "dropout": [["X"], ["Out"]], @@ -127,16 +176,59 @@ _op_real_in_out_name = { "layer_norm": [["X"], ["Y"]], "sigmoid": [["X"], ["Out"]], "elementwise_mul": [["X", "Y"], ["Out"]], + "elementwise_pow": [["X", "Y"], ["Out"]], "scale": [["X"], ["Out"]], "hard_swish": [["X"], ["Out"]], "hard_sigmoid": [["X"], ["Out"]], "gru": [["Input", "Weight"], ["Hidden"]], "lstm": [["Input", "Weight"], ["Hidden"]], "pad2d": [["X"], ["Out"]], + "pad3d": [["X"], ["Out"]], "flatten": [["X"], ["Out"]], "flatten2": [["X"], ["Out"]], "unsqueeze2": [["X"], ["Out"]], - "flatten_contiguous_range": [['X'], ["Out"]], + "unsqueeze2": [["X"], ["Out"]], + "flatten_contiguous_range": [["X"], ["Out"]], + "split": [["X"], ["Out"]], + "squeeze2": [["X"], ["Out"]], + "nearest_interp_v2": [["X"], ["Out"]], + "bilinear_interp": [["X"], ["Out"]], + "bilinear_interp_v2": [["X"], ["Out"]], + "fill_constant_batch_size_like": [["Input"], ["Out"]], + "arg_max": [["X"], ["Out"]], + "abs": [["X"], ["Out"]], + "assign": [["X"], ["Out"]], + "cast": [["X"], ["Out"]], + "clip": [["X"], ["Out"]], + "box_coder": [["PriorBox"], ["OutputBox"]], + "crop": [["X"], ["Out"]], + "cumsum": [["X"], ["Out"]], + "expand_v2": [["X"], ["Out"]], + "fill_any_like": [["X"], ["Out"]], + "fill_constant": [[], ["Out"]], + "gelu": [["X"], ["Out"]], + "instance_norm": [["X"], ["Out"]], + "lookup_table": [["W", "Ids"], ["Out"]], + "lookup_table_v2": [["W", "Ids"], ["Out"]], + "norm": [["X"], ["Norm"]], + "p_norm": [["X"], ["Out"]], + "pow": [["X"], ["Out"]], + "reduce_mean": [["X"], ["Out"]], + "stack": [["X"], ["Y"]], + "top_k_v2": [["X"], ["Out", "Indices"]], + "logical_and": [["X", "Y"], ["Out"]], + "logical_not": [["X"], ["Out"]], + "meshgrid": [["X"], ["Out"]], + "roi_align": [["X", "ROIs"], ["Out"]], + "strided_slice": [["Input"], ["Out"]], + "where": [["Condition", "X", "Y"], ["Out"]], + "grid_sampler": [["X", "Grid"], ["Output"]], + "tile": [["X"], ["Out"]], + "group_norm": [["X"], ["Y", "Mean", "Variance"]], + "reduce_sum": [["X"], ["Out"]], + "square": [["X"], ["Out"]], + "softplus": [["X"], ["Out"]], + "shuffle_channel": [["X"], ["Out"]], } _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] @@ -1797,14 +1889,93 @@ class AddQuantDequantPass(object): quantized ops's inputs. """ _supported_quantizable_op_type = [ - "pool2d", "elementwise_add", "concat", "softmax", "argmax", "transpose", - "equal", "gather", "greater_equal", "greater_than", "less_equal", - "less_than", "mean", "not_equal", "reshape", "reshape2", - "bilinear_interp", "nearest_interp", "trilinear_interp", "slice", - "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6", - "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2", - "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm", - "matmul_v2" + "pool2d", + "elementwise_add", + "concat", + "softmax", + "argmax", + "transpose", + "equal", + "gather", + "greater_equal", + "greater_than", + "less_equal", + "less_than", + "mean", + "not_equal", + "reshape", + "reshape2", + "dropout", + "bilinear_interp", + "nearest_interp", + "trilinear_interp", + "slice", + "squeeze", + "elementwise_sub", + "mul", + "matmul", + "relu", + "relu6", + "leaky_relu", + "tanh", + "swish", + "scale", + "transpose", + "transpose2", + "sigmoid", + "pad2d", + "flatten", + "flatten2", + "batch_norm", + "layer_norm", + "matmul_v2", + "split", + "flatten_contiguous_range", + "squeeze2", + "nearest_interp_v2", + "bilinear_interp", + "bilinear_interp_v2", + "fill_constant_batch_size_like", + "arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "elementwise_mul", + "elementwise_pow", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "hard_sigmoid", + "hard_swish", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pad3d", + "pow", + "prelu", + "reduce_mean", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + "where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "shuffle_channel", ] # To be compatible with PaddleSlim, not remove _activation_type for now diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index 3fadf25150f9ef3556a343fdce8acc24d788f5dc..f97c2778c0918ecbfbed546089c17e9d505818cd 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -52,6 +52,30 @@ def parse_args(): '--debug', action='store_true', help='If used, the graph of Quant model is drawn.') + parser.add_argument( + '--quant_model_filename', + type=str, + default="", + help='The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.' + ) + parser.add_argument( + '--quant_params_filename', + type=str, + default="", + help='If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.' + ) + parser.add_argument( + '--save_model_filename', + type=str, + default="__model__", + help='The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.' + ) + parser.add_argument( + '--save_params_filename', + type=str, + default=None, + help='The name of file to save all related parameters. If it is set None, parameters will be saved in separate files' + ) test_args, args = parser.parse_known_args(namespace=unittest) return test_args, sys.argv[:1] + args @@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path, save_path, ops_to_quantize='', op_ids_to_skip='', - debug=False): + debug=False, + quant_model_filename='', + quant_params_filename='', + save_model_filename='', + save_params_filename=''): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_scope = fluid.executor.global_scope() with fluid.scope_guard(inference_scope): - if os.path.exists(os.path.join(original_path, '__model__')): - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe) + if not quant_model_filename: + if os.path.exists(os.path.join(original_path, '__model__')): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(original_path, + exe) + else: + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, 'model', 'params') else: [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe, - 'model', 'params') + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, quant_model_filename, + quant_params_filename) ops_to_quantize_set = set() print(ops_to_quantize) @@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path, graph = transform_to_mkldnn_int8_pass.apply(graph) inference_program = graph.to_program() with fluid.scope_guard(inference_scope): - fluid.io.save_inference_model(save_path, feed_target_names, - fetch_targets, exe, inference_program) + fluid.io.save_inference_model( + save_path, + feed_target_names, + fetch_targets, + exe, + inference_program, + model_filename=save_model_filename, + params_filename=save_params_filename) print( "Success! INT8 model obtained from the Quant model can be found at {}\n" .format(save_path)) @@ -109,4 +150,6 @@ if __name__ == '__main__': test_args, remaining_args = parse_args() transform_and_save_int8_model( test_args.quant_model_path, test_args.int8_model_save_path, - test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug) + test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug, + test_args.quant_model_filename, test_args.quant_params_filename, + test_args.save_model_filename, test_args.save_params_filename) diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py index 9bf45f4272738c69073d252371b6a6c59aaf15da..ec288a1287119dd436a91843b863ba355bba28fb 100644 --- a/python/paddle/fluid/contrib/sparsity/__init__.py +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -29,10 +29,11 @@ from .asp import decorate from .asp import prune_model from .asp import set_excluded_layers from .asp import reset_excluded_layers +from .supported_layer_list import add_supported_layer __all__ = [ 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', - 'reset_excluded_layers' + 'reset_excluded_layers', 'add_supported_layer' ] diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index ffa12ac70460084fd49a14d0193be6e913495b9a..30439ad736d26f3086a7f87d591aa68a59b7baa8 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -23,6 +23,8 @@ import paddle from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning from paddle.fluid import core OpRole = core.op_proto_and_checker_maker.OpRole @@ -292,8 +294,8 @@ class ASPHelper(object): 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. """ - MASK_APPENDDED_NAME = '_asp_mask' - SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'} + MASK_APPENDDED_NAME = 'asp_mask' + PADDLE_WEIGHT_SUFFIX = "w_" __asp_info = {} @@ -334,7 +336,6 @@ class ASPHelper(object): r""" This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. """ - checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo) if main_program is None: main_program = paddle.static.default_main_program() @@ -345,33 +346,27 @@ class ASPHelper(object): weight_tensor = global_scope().find_var(param.name).get_tensor() weight_nparray = np.array(weight_tensor) - # The double transpose ops here make sure pruning direction consistent with cuSparseLt. - # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. - # cuSparseLt would prune matrix A along k dimension. - # In sparse training, layer weight matriices is viewed sparse matrix A, so - # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle - # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed - # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension - # of W^T, which is m dimension of W. Moreove, all mask generating functions in - # sparsity/utils is row-major pruning. That is the reason we have to transpose weight - # matrices beforce invoking create_mask. Then we transpose the result maks to make - # sure its shape to be the same as the input weight. - weight_sparse_mask = sparsity.create_mask( - weight_nparray.T, func_name=mask_algo, n=n, m=m).T - weight_pruned_nparray = np.multiply(weight_nparray, - weight_sparse_mask) + prune_func = ASPHelper._get_prune_func_by_name(param.name) + + weight_pruned_nparray, weight_sparse_mask = \ + prune_func(weight_nparray, m, n, mask_algo, param.name) + weight_pruned_nparray = weight_pruned_nparray.astype( + weight_nparray.dtype) weight_tensor.set(weight_pruned_nparray, place) - assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ - 'Pruning {} weight matrix failure!!!'.format(param.name) + if with_mask: weight_mask_param = global_scope().find_var( ASPHelper._get_mask_name(param.name)) assert weight_mask_param is not None, \ - 'Cannot find {} variable, please call ASPHelper.minimize' \ + 'Cannot find {} variable, please call optimizer.minimize (' \ + 'paddle.sparsity.decorate(optimizer).minimize(loss)' \ ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name)) weight_mask_tensor = weight_mask_param.get_tensor() + weight_sparse_mask = weight_sparse_mask.astype( + np.array(weight_mask_tensor).dtype) weight_mask_tensor.set(weight_sparse_mask, place) asp_info.update_masks(param.name, weight_sparse_mask) + return asp_info.masks.copy() @staticmethod @@ -384,7 +379,7 @@ class ASPHelper(object): Returns: string: The mask name of :attr:`param_name`. """ - return param_name + ASPHelper.MASK_APPENDDED_NAME + return param_name + "." + ASPHelper.MASK_APPENDDED_NAME @staticmethod def _get_not_ASP_relevant_vars(main_program): @@ -434,19 +429,46 @@ class ASPHelper(object): # fc_0.w_0 -> True # fc_0.b_0 -> False """ - if ASPHelper.MASK_APPENDDED_NAME in param_name: + param_name_list = param_name.split('.') + + if ASPHelper.MASK_APPENDDED_NAME in param_name_list: return False for layer in cls._get_program_asp_info(main_program).excluded_layers: if layer in param_name: return False - for name in ASPHelper.SUPPORTED_LAYERS: - if name in param_name and \ - ASPHelper.SUPPORTED_LAYERS[name] in param_name: - return True + if param_name in supported_layers_and_prune_func_map: + return True + + param_name_no_weight_suffix = param_name_list[0] + param_type_suffix = param_name_list[1] + layer_name = param_name_no_weight_suffix[:param_name_no_weight_suffix. + rfind('_')] + if ASPHelper.PADDLE_WEIGHT_SUFFIX not in param_type_suffix: + return False + + if param_name_no_weight_suffix in supported_layers_and_prune_func_map or \ + layer_name in supported_layers_and_prune_func_map: + return True + return False + @classmethod + def _get_prune_func_by_name(cls, param_name): + func = supported_layers_and_prune_func_map.get(param_name, None) + param_name_no_weight_suffix = param_name.split('.')[0] + if func is None: + func = supported_layers_and_prune_func_map.get( + param_name_no_weight_suffix, None) + if func is None: + layer_name = param_name_no_weight_suffix[: + param_name_no_weight_suffix. + rfind('_')] + func = supported_layers_and_prune_func_map.get(layer_name, + _default_pruning) + return func + @classmethod def _minimize(cls, optimizer, @@ -509,8 +531,7 @@ class ASPHelper(object): if ASPHelper._is_supported_layer(main_program, param_and_grad[0].name): mask_param = layers.create_parameter( - name=param_and_grad[0].name + - ASPHelper.MASK_APPENDDED_NAME, + name=ASPHelper._get_mask_name(param_and_grad[0].name), shape=param_and_grad[0].shape, dtype=param_and_grad[0].dtype, default_initializer=ConstantInitializer(value=1.0)) diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py new file mode 100644 index 0000000000000000000000000000000000000000..105c2ded9eee71f68344d99dba325fee0a155850 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.fluid.contrib import sparsity +import threading + +__all__ = ['add_supported_layer'] + + +def _default_pruning(weight_nparray, m, n, func_name, param_name): + + checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) + + # The double transpose ops here make sure pruning direction consistent with cuSparseLt. + # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. + # cuSparseLt would prune matrix A along k dimension. + # In sparse training, layer weight matrices is viewed sparse matrix A, so + # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle + # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed + # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension + # of W^T, which is m dimension of W. Moreove, all mask generating functions in + # sparsity/utils is row-major pruning. That is the reason we have to transpose weight + # matrices beforce invoking create_mask. Then we transpose the result mask to make + # sure its shape to be the same as the input weight. + weight_sparse_mask = sparsity.create_mask( + weight_nparray.T, func_name=func_name, n=n, m=m).T + weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) + assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ + 'Pruning {} weight matrix failure!!!'.format(param_name) + return weight_pruned_nparray, weight_sparse_mask + + +# When value of given key in this DICT is None, +# ASP will call default pruning function in pruning stage. +_supported_layers_and_prune_func_map_lock = threading.Lock() +supported_layers_and_prune_func_map = {} + + +def add_supported_layer(layer, pruning_func=None): + r""" + Add supported layers and its corresponding pruning function. + + Args: + name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then + it would be turn to string internally. ASP would use this name to match parameter's name and call + its the corresponding pruning function. + pruning_func (function, optional): a function type which receives five argument (weight_nparray, + m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, + m, n, and func_name, please see `prune_model` for details. + """ + name = None + if isinstance(layer, str): + name = layer + elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + type(layer).__name__) + elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + layer.__name__) + else: + assert "The type of layer should be string of Layer, but got {}!".format( + type(layer)) + if pruning_func is None: + pruning_func = _default_pruning + _supported_layers_and_prune_func_map_lock.acquire() + supported_layers_and_prune_func_map.update({name: pruning_func}) + _supported_layers_and_prune_func_map_lock.release() + + +add_supported_layer('fc') +add_supported_layer('linear') +add_supported_layer('conv2d') diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 706ec0d523b938fda0501dfd04f1fc976bf6a26b..5385ac28b90f614fcd6003994b9a7000bc16702a 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._rcvd_idx += 1 self._batches_outstanding -= 1 else: + # NOTE: when _rcvd_idx catch up _send_idx, which means + # one of following: + # 1. all 2 * num_workers batches have been loaded + # and stored in _blocking_queue + # 2. all data drained + # we need to let _thread blocking at _data_queue + # get_data to inoccupy CPU, otherwise may occupy + # CPU time for model running # NOTE: in persistent workers mode, do not check data # drained here, simply let it go to _data_queue # reading to get _ResumeIteration @@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): # may also be data in blocking queue if self._batches_outstanding < len(self._places): return None - continue if self._rcvd_idx in self._task_infos and \ len(self._task_infos[self._rcvd_idx]) == 3: diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 191661b7bf9d5a5b2877f22ebe6d2ec9124f2f96..4127f1e4449bf82aae294ce952122f1f8f6e775f 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -271,18 +271,28 @@ def amp_guard(enable=True, "current_tracer is None, maybe it is not in imperative mode.") # check device_type: - # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16. + # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16. # Maybe we will support cpu for bfloat16. if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False + # For npu: + if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'): + warnings.warn('NPUPlace only support float16 amp.') + enable = False # For xpu: if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') enable = False + # For mlu: + if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'): + warnings.warn('MLUPlace only support float16 amp.') + enable = False # For gpu float16: Compute Capability should >= 7. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. if tracer._expected_place.is_gpu_place(): diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index f7c2d6be574c4e17fa5ce6fa44ca4ecc55a5eb95..c57290861942b8020f6f55792c445d42a0578c90 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -105,9 +105,11 @@ class AmpScaler(object): "current_tracer is None, maybe it is not in imperative mode.") if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False @@ -286,14 +288,28 @@ class AmpScaler(object): ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 ) ] - if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - self._temp_found_inf_fp16) - if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - self._temp_found_inf_fp32) + if core.is_compiled_with_npu(): + float_status = _C_ops.alloc_float_status() + _C_ops.clear_float_status(float_status, float_status) + + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + float_status, param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + float_status, param_grads_fp32, + self._temp_found_inf_fp32) + else: + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + self._temp_found_inf_fp32) + if len(param_grads_fp16) and len(param_grads_fp32): self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 elif len(param_grads_fp16): diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8149d69d36a27fadcefa8dc6b6ff1dd89792e29e..9439982858530e1e81156be4b32ef2d91dc4a33a 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -565,16 +565,25 @@ def grad(outputs, if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} cannot be empty".format(name) for each_var in in_out_list: - assert isinstance( - each_var, - core.VarBase), "Elements of {} must be Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager. + Tensor), "Elements of {} must be Tensor".format(name) + else: + assert isinstance( + each_var, + core.VarBase), "Elements of {} must be Variable".format( + name) return in_out_list else: - assert isinstance( - in_out_list, - core.VarBase), "{} must be Variable or list of Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + in_out_list, core.eager. + Tensor), "{} must be Tensor or list of Tensor".format(name) + else: + assert isinstance( + in_out_list, core.VarBase + ), "{} must be Variable or list of Variable".format(name) return [in_out_list] outputs = check_in_out(outputs, 'outputs') @@ -586,9 +595,14 @@ def grad(outputs, for each_var in grad_outputs: if each_var is not None: - assert isinstance( - each_var, core.VarBase - ), "grad_outputs must be None, a Variable or a list containing None or Variables" + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager.Tensor + ), "grad_outputs must be None, a Variable or a list containing None or Variables" + else: + assert isinstance( + each_var, core.VarBase + ), "grad_outputs must be None, a Variable or a list containing None or Variables" else: grad_outputs = [] @@ -600,14 +614,27 @@ def grad(outputs, no_grad_vars = [] elif isinstance(no_grad_vars, core.VarBase): no_grad_vars = [no_grad_vars] + elif isinstance(no_grad_vars, core.eager.Tensor): + no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, (list, tuple, set)): no_grad_vars = list(no_grad_vars) for var in no_grad_vars: - assert isinstance( - var, core.VarBase), "no_grad_vars can only contains Variable" + if core._in_eager_mode(): + assert isinstance( + var, + core.eager.Tensor), "no_grad_vars can only contains Tensor" + else: + assert isinstance( + var, + core.VarBase), "no_grad_vars can only contains Variable" else: - raise AssertionError( - "no_grad_vars must be None, Variable or list/tuple/set of Variables") + if core._in_eager_mode(): + raise AssertionError( + "no_grad_vars must be None, Tensor or list/tuple/set of Tensors") + else: + raise AssertionError( + "no_grad_vars must be None, Variable or list/tuple/set of Variables" + ) assert isinstance(create_graph, bool), "create_graph must be True or False" @@ -622,6 +649,11 @@ def grad(outputs, assert isinstance(only_inputs, bool), "only_inputs must be True or False" assert only_inputs, "only_inputs=False is not supported yet" + if core._in_eager_mode(): + return core.eager.run_partial_grad( + outputs, inputs, grad_outputs, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars) + place = core.Place() place.set_place(framework._current_expected_place()) return core.dygraph_partial_grad(inputs, outputs, grad_outputs, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index e1df2324889b440737740e443228d0fa69b47b51..7733226cc09f2d6e2f9bcb8403ed1be42aa75e0c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -297,10 +297,6 @@ class TensorShapeTransformer(gast.NodeTransformer): return False def _update_name_to_var_shape(self, node): - def replace_dot(name): - # replace all '.' into '_' - return name.replace('.', '_') - assert isinstance(node, gast.Assign) target_node = node.targets[0] value_node = node.value @@ -315,7 +311,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if value_node.id in self.name_to_var_shape: # TODO(zhhsplendid): is context a problem for the result node of gast.parse? static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -337,7 +332,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Attribute): if self._is_var_shape(value_node): # eg: x.shape static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -370,7 +364,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Name): if value_node.id in self.name_to_var_shape: static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -387,7 +380,7 @@ class TensorShapeTransformer(gast.NodeTransformer): self.name_to_var_shape[target_id] = static_shape_var_name elif self._is_var_shape(value_node): # eg: x.shape or x.shape[0] static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse(static_shape_var_name).body[ 0].value static_shape_value_node = copy.deepcopy(value_node) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index f58952d3036c506341955eff2472079bb696bb1f..a36164a277dec0762e7ba49a1d158837f27bc517 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -30,6 +30,7 @@ from paddle.fluid.layers import nn from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import in_dygraph_mode +from paddle import _C_ops __all__ = ['TranslatedLayer'] @@ -761,6 +762,21 @@ def _construct_params_and_buffers(model_path, return var_dict +def _valid_vars(vars): + if vars: + return vars + if framework._in_eager_mode(): + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + else: + return [ + core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + + def _run_dygraph(instance, input, program_holder): # 1. prepare inputs, outputs, attrs @@ -826,17 +842,12 @@ def _run_dygraph(instance, input, program_holder): # hold forward variables if framework._in_eager_mode(): - tmp_scope_vec = core.eager.Tensor( - dtype=core.VarDesc.VarType.FP32, - dims=[], - name="program_out_scope", - type=core.VarDesc.VarType.STEP_SCOPES, - persistable=True) + tmp_scope_vec = [program_holder.scope] else: tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - tmp_scope_vec.value().set_scope(program_holder.scope) + tmp_scope_vec.value().set_scope(program_holder.scope) double_grad_vars = [] for var_desc in program_holder.double_grad_descs: @@ -852,41 +863,18 @@ def _run_dygraph(instance, input, program_holder): var_desc.shape(), var_desc.name(), var_desc.type(), False) double_grad_vars.append(var) - if len(double_grad_vars) == 0: - if framework._in_eager_mode(): - double_grad_vars = [ - core.eager.Tensor( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] - else: - double_grad_vars = [ - core.VarBase( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] # 2. run program by op trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program end_op_index = program_holder.infer_program.block(0).op_size() - framework._dygraph_tracer().trace_op( - type='run_program', - inputs={'X': input_vars, - 'Params': persistable_vars}, - outputs={ - 'Out': output_vars, - 'OutScope': tmp_scope_vec, - 'DOut': double_grad_vars - }, - attrs={ - 'global_block': trace_program.block(0), - 'start_op_index': 0, - 'end_op_index': end_op_index, - 'is_test': instance._is_test, - 'program_id': _hash_with_id(trace_program, instance) - }) + attrs = ('global_block', trace_program.block(0), 'start_op_index', 0, + 'end_op_index', end_op_index, 'is_test', instance._is_test, + 'program_id', _hash_with_id(trace_program, instance)) + _C_ops.run_program( + _valid_vars(input_vars), + _valid_vars(persistable_vars), + _valid_vars(output_vars), tmp_scope_vec, + _valid_vars(double_grad_vars), *attrs) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient # will be SelectedRows, not LoDTensor. But tracer will just @@ -914,8 +902,10 @@ def _run_dygraph(instance, input, program_holder): def drop_scope_if_no_grad(instance, scope_vec): tracer = framework._dygraph_tracer() + scope = scope_vec.value().get_scope() if isinstance(scope_vec, ( + core.VarBase)) else scope_vec[0] if (not instance._is_test) and (not tracer._has_grad): - scope_vec.value().get_scope().drop_kids() + scope.drop_kids() def _run_static_graph(input, program_holder, trace_program): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index b1865691b2475c4f855f51244e627965047d7720..1e1ce3ba7e4912d391085c7acbd7aa4bbb6a4da1 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -821,7 +821,7 @@ def save(layer, path, input_spec=None, **configs): for var in flatten(input_spec): if isinstance(var, paddle.static.InputSpec): inner_input_spec.append(var) - elif isinstance(var, (core.VarBase, Variable)): + elif isinstance(var, (core.VarBase, core.eager.Tensor, Variable)): inner_input_spec.append( paddle.static.InputSpec.from_tensor(var)) else: diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 53dbf1a66b27f35a75b44a0b6444cd8282c5278c..6957850d205794363183b4e6ca58a6daf3e11358 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -760,7 +760,8 @@ class Layer(object): raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif tensor is not None and not type(tensor) == core.VarBase: + elif tensor is not None and not (type(tensor) == core.VarBase or + type(tensor) == core.eager.Tensor): raise TypeError( "The registered buffer should be a core.VarBase, but received {}.". format(type(tensor).__name__)) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index d0552ca41f0daf56ce23317dd06cb5744baaff84..d8b1883fc62a0fb4575a2e525d7d37a9029cf40d 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -35,6 +35,12 @@ final_state_name_mapping = { "x": "X", "out": "Out", }, + "pool2d": { + "final_op_name": "final_state_pool2d", + "x": "X", + "kernel_size": "ksize", + "out": "Out", + }, "abs": { "final_op_name": "final_state_abs", "x": "X", diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fd7226c48661fdb2cd4dcf7227d0f8383c6c9439..000f08b0a3e282d815c758b5a153ba53ff84c8e0 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): if dim_size == -1: assert unk_dim_idx == -1, ( "Only one dimension value of 'shape' in reshape can " - "be -1. But received shape[%d] is also -1." % dim_idx) + "be -1. But received shape[%d] is also -1.\n" + "\n\t# N = x.shape()[2]\t\t# N is an int. " + "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t" + "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])" + "\t# z.shape is [-1, -1, 4]\n\n" + " If your target shape in Reshape represents dynamic shape, " + "please turn it into a Tensor under @to_static. See above example for details." + % dim_idx) unk_dim_idx = dim_idx elif dim_size == 0: assert dim_idx < len(x.shape), ( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cbe360f556cd986646bc7f45b3a80ab0f5edb9eb..44e6f8e8f2a6d11371f21fff5a9dccefcd72ebed 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -960,6 +960,7 @@ set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) +set_tests_properties(test_inplace_eager_fluid PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) @@ -1118,9 +1119,9 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b499a9e01c36eefcb9b6cb91956abc5ee0a99b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake + + +class MyOwnLayer(Layer): + def __init__(self): + super(MyOwnLayer, self).__init__() + + def forward(self, x): + return x + + +static_tensor = None +static_tensor_mask = None + + +def my_own_pruning(tensor, m, n, mask_algo, param_name): + global static_tensor + global static_tensor_mask + if static_tensor is None: + static_tensor = np.random.rand(*tensor.shape).astype(np.float32) + if static_tensor_mask is None: + static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32) + return static_tensor, static_tensor_mask + + +class TestASPAddSupportedLayer(unittest.TestCase): + def test_add_supported_layer_via_name(self): + sparsity.add_supported_layer("test_supported_1") + sparsity.add_supported_layer("test_supported_2", my_own_pruning) + sparsity.add_supported_layer(MyOwnLayer) + my_own_layer_name = _convert_camel_to_snake(MyOwnLayer.__name__) + + self.assertTrue( + "test_supported_1" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue(supported_layers_and_prune_func_map["test_supported_2"] + == my_own_pruning) + self.assertTrue( + my_own_layer_name in supported_layers_and_prune_func_map) + + +class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + self.customer_prefix = "customer_layer" + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + self.supported_layer_count_ref = 5 + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self.exe = fluid.Executor(self.place) + + sparsity.add_supported_layer(self.customer_prefix, my_own_pruning) + + def test_inference_pruning(self): + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=False) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + def test_training_pruning(self): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = sparsity.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=True) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + mat_mask = np.array(fluid.global_scope().find_var( + sparsity.asp.ASPHelper._get_mask_name(param.name)) + .get_tensor()) + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + self.assertLessEqual( + np.sum(mat_mask.flatten() - static_tensor_mask.flatten( + )), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertTrue( + sparsity.check_sparsity( + mat_mask.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 1f7ae53acdf4536921ca25e27874279f271b4de8..a730d21afa57980538841a3ad7fe874fd2343d4a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -9,6 +9,12 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + + py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS}) + py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) + py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) + py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..ab704a6a25714ef2fb935d6e3e776105aa4142cc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py @@ -0,0 +1,152 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +from paddle.distributed.auto_parallel.tuner import recorder as rd + + +class TestRecorder(unittest.TestCase): + def test_register(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + self.assertEqual(set(recorder.records.keys()), {"metric"}) + self.assertEqual(recorder.records["metric"].direction, "min") + + def test_exists(self): + recorder = rd.MetricsRecorder() + recorder.register("metric", direction="max") + self.assertTrue(recorder.exists("metric")) + + def test_update(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 4, 1000) + self.assertEqual(recorder.records["metric"].direction, "min") + self.assertEqual( + recorder.get_records("metric"), [rd.MetricRecord(4, 1000)]) + + def test_get_records(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 1, step=0) + recorder.update("metric", 2, step=1) + recorder.update("metric", 3, step=2) + recorder.update("metric", 4, step=3) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_set_records(self): + recorder = rd.MetricsRecorder() + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_get_best_value(self): + recorder = rd.MetricsRecorder() + recorder.register("metric_min", "min") + recorder.register("metric_max", "max") + + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_min"), 1) + + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_max"), 4) + + def test_get_best_step(self): + recorder = rd.MetricsRecorder() + + recorder.register("metric_min", "min") + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_min"), 0) + + recorder.register("metric_max", "max") + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_max"), 3) + + def test_get_statistics(self): + recorder = rd.MetricsRecorder() + records = [rd.MetricRecord(np.random.random(), i) for i in range(14)] + recorder.set_records("metric", records) + stats = recorder.get_statistics("metric") + records = [r.value for r in records] + self.assertEqual(stats["min"], np.min(records)) + self.assertEqual(stats["max"], np.max(records)) + self.assertEqual(stats["mean"], np.mean(records)) + self.assertEqual(stats["median"], np.median(records)) + self.assertEqual(stats["var"], np.var(records)) + self.assertEqual(stats["std"], np.std(records)) + + def test_serialization(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + print(recorder.get_state()) + new_recorder = rd.MetricsRecorder.from_state(recorder.get_state()) + self.assertEqual(new_recorder.records.keys(), recorder.records.keys()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py new file mode 100644 index 0000000000000000000000000000000000000000..fc52d1c394effc223a609ae5db73ea89a25c298b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_space as ts +from paddle.distributed.auto_parallel.tuner import trial as tr + + +class TestTiral(unittest.TestCase): + def test_trial(self): + space = ts.TunableSpace() + space.choice("choice", [0, 1, 2, 3], default=2) + trial = tr.Trial(space, trial_id="trial-1") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + self.assertEqual(trial.id, "trial-1") + self.assertEqual(trial.space.get_value("choice"), 2) + self.assertEqual(trial.best_step, 0) + self.assertEqual(trial.status, "RUNNING") + + def test_serialization(self): + space = ts.TunableSpace() + space.int_range("int_range", start=1, stop=4, default=2) + trial = tr.Trial(space, trial_id="trial-2", status="COMPLETED") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + new_trial = tr.Trial.from_state(trial.get_state()) + self.assertEqual(new_trial.id, "trial-2") + self.assertEqual(new_trial.space.get_value("int_range"), 2) + self.assertEqual(new_trial.best_step, 0) + self.assertEqual(new_trial.status, "COMPLETED") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py new file mode 100644 index 0000000000000000000000000000000000000000..1179fd9a9f0887f5133349118eb5b4c8fbab733d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.completion import Completer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.utils import make_data_unshard +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context +from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [[0, 1], [2, 3]] + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, 0] + }) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _g_process_mesh[1], + "dims_mapping": [0, -1] + }) + out = self.linear1(out) + + return out + + +def loop_cond(i, loop_len, input_array): + return i < loop_len + + +def loop_body(i, loop_len, input_array): + pre_input = paddle.tensor.array_read(array=input_array, i=i) + mlp_while0 = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + mlp_while1 = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + output = mlp_while0(pre_input) + cur_pred = mlp_while1(output) + # 更新循环条件 + i = paddle.increment(x=i, value=1) + paddle.tensor.array_write(cur_pred, array=input_array, i=i) + return i, loop_len, input_array + + +def get_program(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + with static.program_guard(train_program, start_program): + + # 循环计数器 + i = paddle.full(shape=[1], fill_value=0, dtype='int64') + # 循环次数 + loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64') + + # input + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] + # dataloader + dataloader = paddle.io.DataLoader.from_generator( + feed_list=data_holder, capacity=4 * batch_size, iterable=False) + dataloader.set_batch_generator( + batch_generator_creator(), places=paddle.static.cuda_places()) + # data dist_attr + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, -1, -1] + }) + auto.shard_tensor( + label, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, -1, -1] + }) + + mlp_start = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_start(input) + + input_array = paddle.tensor.array_write(pred, i) + i, loop_len, input_array = static.nn.while_loop( + cond=loop_cond, + body=loop_body, + loop_vars=[i, loop_len, input_array]) + end_pred = paddle.tensor.array_read(array=input_array, i=i) + + mlp_end = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_end(end_pred) + + error_cost = paddle.nn.functional.square_error_cost(pred, label) + loss = paddle.mean(error_cost) + + return train_program, start_program, dataloader, i, loss + + +class TestMLP(unittest.TestCase): + def test_completer(self): + train_program, start_program, dataloader, i, loss = get_program() + dist_context = DistributedContext() + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) + # print_program_with_dist_attr(complete_train_program, dist_context) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index cac64c7391351b23c4b9f9275c4b20bf85f571fd..2b8307461b8f57ea73503cf6ad4e8a90cdba652c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator +from paddle.fluid.framework import _test_eager_guard from predictor_utils import PredictorTools @@ -155,6 +156,13 @@ class TestMNISTWithToStatic(TestMNIST): np.allclose(dygraph_loss, static_loss), msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, static_loss)) + with _test_eager_guard(): + dygraph_loss = self.train_dygraph() + static_loss = self.train_static() + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, + static_loss)) def test_mnist_declarative_cpu_vs_mkldnn(self): dygraph_loss_cpu = self.train_dygraph() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 06d69daa75d1c755d3c9f2b111e31297c4905d8f..d05be03bbfb193ae25ee039aef1608afdef4f585 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -223,6 +223,12 @@ def dyfunc_len_paddle_shape(): print(x) +def dyfunc_dict_assign_shape(): + x = paddle.to_tensor([1, 2]) + a = {} + a['shape'] = x.shape[0] + + # 1. Basic tests without control flow class TestTensorShapeBasic(unittest.TestCase): def setUp(self): @@ -592,6 +598,8 @@ class TestPaddleShape(unittest.TestCase): def test_paddle_shape(self): func = paddle.jit.to_static(dyfunc_len_paddle_shape) self.assertEqual('paddle.shape(x)' in func.code, True) + func = paddle.jit.to_static(dyfunc_dict_assign_shape) + self.assertEqual("__static_convert_var_shape_suffix" in func.code, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py index 66c547de2c28068a9605c506416baabdf229265d..2e84607e2f5c24bde4b3d291cc1b7f33f89c5751 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py @@ -25,17 +25,120 @@ from hypothesis import given, settings, seed, example, assume import hypothesis.strategies as st -class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): +# the two inputs of elementwise_add are tensor +class TestConvElementwiseAddMkldnnFusePass1(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[1]['data_format'] == "NHWC": + if attrs[1]['data_format'] == "NHWC" and attrs[3]['axis'] == 0: + return False + if attrs[1]['data_format'] == "NCHW" and attrs[3]['axis'] == -1: return False + return True + + def sample_program_config(self, draw): + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.sampled_from([1, 2, 4])) + paddings = draw(st.sampled_from([[0, 3], [1, 1], [1, 2, 3, 4]])) + strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + axis = draw(st.sampled_from([-1, 0])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + if data_format == "NCHW": + return np.random.random( + [batch_size, 48, 64, 64]).astype(np.float32) + else: + return np.random.random( + [batch_size, 64, 64, 48]).astype(np.float32) + + def generate_weight(): + return np.random.random( + [48, int(48 / groups), 3, 3]).astype(np.float32) + + relu_op = OpConfig( + type="relu", + inputs={"X": ["input_data"]}, + outputs={"Out": ["relu_out"]}, + attrs={}) + + conv2d_op1 = OpConfig( + type="conv2d", + inputs={"Input": ["relu_out"], + "Filter": ["conv_weight1"]}, + outputs={"Output": ["conv_output1"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + conv2d_op2 = OpConfig( + type="conv2d", + inputs={"Input": ["input_data"], + "Filter": ["conv_weight2"]}, + outputs={"Output": ["conv_output2"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + elt_op = OpConfig( + type="elementwise_add", + inputs={"X": ["conv_output1"], + "Y": ["conv_output2"]}, + outputs={"Out": ["elementwise_output"]}, + attrs={'axis': axis}) + model_net = [relu_op, conv2d_op1, conv2d_op2, elt_op] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "conv_weight1": TensorConfig(data_gen=partial(generate_weight)), + "conv_weight2": TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["elementwise_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["relu", "conv2d", "conv2d"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) + + +''' +class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + if "elementwise_weight" in program_config.weights: + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[1]: + if attrs[2]['axis'] != 1: + return False + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[3]: + if attrs[2]['axis'] != -1: + return False return True def sample_program_config(self, draw): @@ -101,7 +204,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): "strides": strides }) - if axis == -1 or axis == 0: + if axis == 0: elt_op = OpConfig( type="elementwise_add", inputs={"X": ["input_data1"], @@ -118,14 +221,12 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): model_net = [relu_op, conv2d_op, elt_op] - if axis == 1: + if axis == 0: program_config = ProgramConfig( ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)), - "elementwise_weight": - TensorConfig(data_gen=partial(generate_weight2)) + TensorConfig(data_gen=partial(generate_weight1)) }, inputs={ "input_data1": @@ -137,7 +238,9 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)) + TensorConfig(data_gen=partial(generate_weight1)), + "elementwise_weight": + TensorConfig(data_gen=partial(generate_weight2)) }, inputs={ "input_data1": @@ -154,7 +257,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): def test(self): self.run_and_statis( quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) - +''' if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py index 33df428388882f2e536ecebb15a1f5dae6a6afc5..81bb182802ede6a2b78ffc44345cdcf382d344af 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py @@ -19,6 +19,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import paddle import hypothesis from hypothesis import given, settings, seed, example, assume @@ -104,4 +105,5 @@ class TestConvGeluMkldnnFusePass(PassAutoScanTest): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 8e31d58195be8b17243fd5203fd8ced17c11f183..e9d9af5c11366c258d1fdab34b1e9ea345b0bfad 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -22,4 +22,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_elementwise_add_op_npu PROPERTIES TIMEOUT 200) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 877f9904f3407c8e600995c2cf65cf849d49cdd5..e01b2b691a28aa788836a7f0d66fb2723fc1b364 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -144,6 +144,7 @@ class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.set_npu() + self.init_dtype() self.use_mkldnn = False self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] @@ -153,6 +154,9 @@ class TestBatchNormOpTraining(unittest.TestCase): self.init_kernel_type() self.init_test_case() + def init_dtype(self): + self.dtype = np.float32 + def init_test_case(self): self.use_global_stats = False self.no_grad_set = set() @@ -210,11 +214,16 @@ class TestBatchNormOpTraining(unittest.TestCase): scale_shape = [c] np.random.seed(123) - x = np.random.random_sample(shape).astype(np.float32) + x = np.random.random_sample(shape).astype(self.dtype) scale = np.random.random_sample(scale_shape).astype(np.float32) bias = np.random.random_sample(scale_shape).astype(np.float32) mean, variance = self.set_mean_variance(scale_shape, x, data_layout) - y_grad = np.random.random_sample(shape).astype(np.float32) + + if self.dtype == np.float16: + mean = mean.astype(np.float32) + variance = variance.astype(np.float32) + + y_grad = np.random.random_sample(shape).astype(self.dtype) momentum_var = np.array([momentum]).astype(np.float32) y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( @@ -275,7 +284,7 @@ class TestBatchNormOpTraining(unittest.TestCase): inputs=inputs, outputs=outputs, attrs=attrs) - block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + block.create_var(name='y@GRAD', dtype=self.dtype, shape=y.shape) # generate backward op_desc grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( @@ -320,6 +329,11 @@ class TestBatchNormOpTraining(unittest.TestCase): pass +class TestFP16BatchNormOpTraining(TestBatchNormOpTraining): + def init_dtype(self): + self.dtype = np.float16 + + class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining): def init_test_case(self): self.use_global_stats = False diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py index 012a6e59e775f8ab2d27c23c779571022d6c194f..2e15a1eac2b4b891712fb5889a8974a04c5766c0 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py @@ -132,36 +132,50 @@ class TestDepthwiseConvNPU(OpTest): self.check_output_with_place(self.place, atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return if self.dilations[0] == 1 and self.dilations[1] == 1: - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - no_grad_set=set(['Filter']), - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) - - def test_check_grad_no_input(self): - if self.dtype == np.float16: - return - if self.dilations[0] == 1 and self.dilations[1] == 1: self.check_grad_with_place( - self.place, ['Filter'], + self.place, ['Input'], 'Output', - no_grad_set=set(['Input']), + no_grad_set=set(['Filter']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + no_grad_set=set(['Filter']), max_relative_error=0.03, numeric_place=paddle.CPUPlace()) + def test_check_grad_no_input(self): + if self.dilations[0] == 1 and self.dilations[1] == 1: + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) + def init_data_format(self): self.data_format = "NCHW" @@ -267,32 +281,46 @@ class TestDepthwiseConvNPU_Padding(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=1.2) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.7, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter']), + numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.8, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_data_format(self): self.data_format = "NCHW" diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py index d0dc86055a1635c8bac644570f17e158cb2adda3..4070d0267d95b6cec2d3a2cb9926f9b389b69c50 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py @@ -127,8 +127,6 @@ class TestConv2DOp(OpTest): self.check_output_with_place(fluid.NPUPlace(0), atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), {'Input', 'Filter'}, 'Output', @@ -136,8 +134,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Input'], 'Output', @@ -146,8 +142,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Filter'], 'Output', @@ -276,10 +270,13 @@ class TestConv2DOp_v2(OpTest): def set_npu(self): self.__class__.use_npu = True + def init_dtype(self): + self.dtype = np.float32 + def setUp(self): self.set_npu() self.op_type = "conv2d" - self.dtype = np.float32 + self.init_dtype() self.init_kernel_type() self.init_group() self.init_dilation() @@ -320,31 +317,45 @@ class TestConv2DOp_v2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), {'Input', 'Filter'}, - 'Output', - max_relative_error=0.02, - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=1.1) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=0.02, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter']), + numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Filter'], - 'Output', - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_test_case(self): self.pad = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py index 9b29fc812faedde2aa28c9b597c6e8449bbd36b0..a4769442b083eb845daa9f7989c8621a3d475ef8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py @@ -51,8 +51,6 @@ class TestCos(OpTest): self.check_output_with_place(self.place, atol=1e-7) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py index bd9022f56a3e77fa92c74637d5947869b201ac54..fea8502f2d7664b2717b42df9923171f880a1db2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py @@ -56,8 +56,6 @@ class TestDropoutOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 75c70e0a131ac996395427d9d3cdb7f2b7dd8ff7..f24c6c455a0cb306df4ea048641351c5309f5acd 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -65,36 +65,59 @@ class TestElementwiseAddOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.15, ) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.006, ) def test_check_grad_ingore_x(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.92, ) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.006, ) def test_check_grad_ingore_y(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.8, ) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.006, ) class TestFP16ElementwiseAddOp(TestElementwiseAddOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py index 461e15352e3837f90dfa290bf32dddc2ab26b6b8..cbfc07f35447939c9db7e216db0d3a1f530630fe 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py @@ -116,19 +116,13 @@ class TestElementwiseMaxOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', no_grad_set=set("X")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', no_grad_set=set("Y")) @@ -213,15 +207,11 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): self.out = np.maximum(self.x, self.y.reshape(1, 1, 100)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['Y'], @@ -230,8 +220,6 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): user_defined_grads=[dy]) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py index 51cf5cdaf6d1afb4a6aad64ddac4600b8d800358..e191224df81ee419e58b843dfd90b74c3fd113c1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py @@ -64,32 +64,41 @@ class TestElementwiseMinOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', ) + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', ) def test_check_grad_ingore_x(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), ) + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), ) def test_check_grad_ingore_y(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), ) + self.check_grad_with_place( + self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.1) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), ) class TestElementwiseMinOpFp16(TestElementwiseMinOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py index ce645f317d054c264a730c150df42bccbfabbeee..907e149c8b2c3fb6093b279849a0aff48abfdb39 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py @@ -114,8 +114,6 @@ class TestElementwisePow(OpTest): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -184,8 +182,6 @@ class TestElementwisePowOp_broadcast_0(TestElementwisePow): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -218,8 +214,6 @@ class TestElementwisePowOp_broadcast_1(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(1, 100, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -252,8 +246,6 @@ class TestElementwisePowOp_broadcast_2(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(100, 1, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) diff --git a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py index ccd5f0649d8dc68bb9cc8bb3e1736ced26c7cf7f..6be2fe0086b128851a79016fbeb2eaf705111199 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py @@ -50,8 +50,6 @@ class TestExpNPUOP(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 89ac9e09aa3488c25000c7801f108e036f33934e..83b65630d801a40aebf59e0f8e464aae5827d84a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -34,7 +34,7 @@ class TestExpand(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} @@ -50,12 +50,8 @@ class TestExpand(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestExpandV2(TestExpand): @@ -66,7 +62,7 @@ class TestExpandV2(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) expand_times = np.array([1, 10, 1]).astype(np.int32) @@ -145,7 +141,7 @@ class TestExpand_expand_times_all_one(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 1, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py index d7aafccc88cf8d9ffde9c0b4923239abe14c3cc9..f1d89cb8d561b2cb0b10e94d0d1f084cf8733ea1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py @@ -59,9 +59,6 @@ class TestNPUHardSigmoid(OpTest): self.check_output_with_place(self.place, atol=1e-5) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py index 32042ba83a9f7723a03f3865319dafc13e1ae649..9495cdb8a55aa9e4e62ad66117cc9b41308d5d76 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py @@ -66,8 +66,6 @@ class TestHardSwishNPU(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # There is a problem that precision of grad result using float32 # can't satisfy the default precision requirement # when compared with numeric_grads, but the results on diff --git a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py index 1c9f499d22db42bf89a40b64e8f05a131785956e..a9c195bb8cd29f2c278ee974601eca1ad7e0358d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py @@ -81,13 +81,9 @@ class TestHuberLossOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', @@ -95,8 +91,6 @@ class TestHuberLossOp(OpTest): no_grad_set=set("residual")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', diff --git a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py index 6e5b4c012053f7e5e8cee28c7d54be3152ecb4cd..d02ddae461ba5c4182c03c70f7b7e39b639baa9d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py @@ -78,8 +78,10 @@ class TestLabelSmoothOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py index 590a961269989548ee03ed550bcb6ef3faa527f0..a0472f9611eb01c8230efae3555025967398f2f0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py @@ -63,8 +63,10 @@ class TestLeadyRelu(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLeadyReluFP16(TestLeadyRelu): diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py index 9534431e99a7a2e0218fe08dfd95a770b9924915..5da3cb0ce56503da8edda2506077f7de273375ef 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py @@ -50,12 +50,8 @@ class TestLog(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestLogFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py index f6baefec7f29e769a23a8777b0a5796289c6606d..10ec8621ffa58d9a2ada40f2ff6537a685094cc5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py @@ -63,9 +63,13 @@ class TestLogSoftmaxNPUOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + self.check_grad_with_place( + self.place, ['X'], ['Out'], + user_defined_grads=[self.x_grad], + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) def test_class(op_type, typename): diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index fefff0974ae40d2b9ac9d1a5f81410283cef0761..8ec9eb1cf3572703c656408da21a1f2f3d79123e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -77,8 +77,10 @@ class TestLookupTableV2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['W'], 'Out') + self.check_grad_with_place( + self.place, ['W'], 'Out', max_relative_error=0.01) + else: + self.check_grad_with_place(self.place, ['W'], 'Out') class TestLookupTableV2FP16(TestLookupTableV2): diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py index f3df1fca30749e55599c7f19d336ddb9ff41edbd..ec51dcf3f8e3e107574dc02ee69693664b74ff36 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py @@ -39,10 +39,11 @@ class TestNearestInterpOp(OpTest): self.set_npu() self.out_size = None self.actual_shape = None + self.init_dtype() self.data_layout = 'NCHW' self.init_test_case() self.op_type = "nearest_interp_v2" - input_np = np.random.random(self.input_shape).astype("float32") + input_np = np.random.random(self.input_shape).astype(self.dtype) if self.data_layout == "NCHW": in_h = self.input_shape[2] @@ -95,8 +96,21 @@ class TestNearestInterpOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - self.check_grad_with_place( - self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.006) + + def init_dtype(self): + self.dtype = np.float32 def init_test_case(self): self.interp_method = 'nearest' @@ -108,6 +122,11 @@ class TestNearestInterpOp(OpTest): self.align_corners = False +class TestNearestNeighborInterpFP16(TestNearestInterpOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestNearestNeighborInterpCase1(TestNearestInterpOp): def init_test_case(self): self.interp_method = 'nearest' diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py index 2c41f09ff51488dd8e6eff48fa0dec0a6917bf50..8e28b3fe413b071d63123203d0f2a842f6d041ba 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py @@ -54,9 +54,6 @@ class TestNPUNormOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.006) diff --git a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py index 3b75cba60b103fce118d2b0aca6eacf50fe9b809..a7ca4edc524be12e00536fa8f08bb6223004943f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py @@ -51,8 +51,6 @@ class TestPnormOp(OpTest): self.check_output_with_place(paddle.NPUPlace(0)) def test_check_grad(self): - if self.dtype == "float16": - return self.check_grad_with_place( paddle.NPUPlace(0), ['X'], 'Out', user_defined_grads=self.gradient) diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py index 7d6c3b9bdb444667e986fa92f6be5963eaf71f97..d1d2e8b3467be10ff075f357ccf7cb43ef263db7 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py @@ -50,9 +50,10 @@ class TestPadOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.6) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): self.__class__.use_npu = True diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py index 2b8550a88de592d70299111f8d33b4e978f2177a..4822abc3b25ebed695bc6d0a9fe6b564cef3ab63 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py @@ -67,9 +67,6 @@ def create_test_fp16_class(parent): self.use_cudnn = False self.dtype = np.float16 - def test_check_grad(self): - return - cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestFp16Case.__name__ = cls_name globals()[cls_name] = TestFp16Case diff --git a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py index e8f5de005d421566451bff7a211961a311da3195..899d4ef43bd860251da77cb42b482343d5643fba 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py @@ -40,8 +40,6 @@ class TestNPUReciprocal(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py index 601a351c015f3258ebd23732dd0f76282e8f7d8e..b1cb5e02a731f8bbbc36097a73b609909fc2320b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py @@ -56,8 +56,6 @@ class TestRelu6(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py index a2547808e6f161ae1cdac5ea5944863d7c640d24..c909b14b5141fe1725e10642abec57eb416c1af8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py @@ -34,11 +34,12 @@ class TestRelu(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + self.inputs = {'X': x} self.outputs = {'Out': out} def set_npu(self): @@ -50,32 +51,18 @@ class TestRelu(OpTest): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') -class TestReluFp16(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "relu" - self.place = paddle.NPUPlace(0) - - self.init_dtype() - np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} - self.outputs = {'Out': out} - - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestReluFp16(TestRelu): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) - class TestReluNeg(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py index 4516b25b59d9c080a4ff12de162440da1f196150..489f8bfb116a19cfaf3348f647cd584483788a75 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py @@ -44,8 +44,6 @@ class TestNPUSigmoid(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 611691109e187b98d67379a3952fea0e0afd88e9..a5b203b6eea2a6c147194aabe36cbc6c600ae971 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -58,12 +58,17 @@ class TestSliceOp(OpTest): self.place = paddle.NPUPlace(0) def test_check_output(self): - self.check_output_with_place(self.place) + if self.dtype == np.float16: + self.check_output_with_place(self.place) + else: + self.check_output_with_place(self.place) def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.02) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOp2(TestSliceOp): @@ -347,8 +352,10 @@ class TestSliceOpDecsDim(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOpDecsDimFp16(TestSliceOpDecsDim): diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py index 8d78ee6a97efdd1df99c9636e8e18a2905d858a5..f0ca7788345765f5fcef3ebf23e5ca25bc97eaea 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py @@ -87,8 +87,6 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # fp32 has low precision, cpu and npu both need to relax the max_relative_error if using fp32 self.check_grad_with_place( self.place, ['Logits'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py index acb99746d231ded16032bfdc1839b6b0f3120f62..24b34fa625c6339f6b990076fe6d0a874e7ba316 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py @@ -50,12 +50,11 @@ class TestSqrt(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestSqrtFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py index caf55b4850f0b18f0fb20ed5692119c2b4ceccc2..170f6b6ca4f934c1bf29433502718b5fc35b25d4 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py @@ -51,8 +51,6 @@ class TestSquare(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py index 55be94da2b7e0346d8c6783d244c9d3a2c43273e..375eef12291ec50af416c38292adecf17fa83277 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py @@ -50,12 +50,11 @@ class TestTanh(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestTanhFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 457f20ac5b06be0afb6929dc74148eb42b3ce4db..530ea2838a76fcf01a3047be56f46dea0232619e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -715,10 +715,11 @@ class OpTest(unittest.TestCase): assert related_idx >= 0, "%d-th arguments don't have default value" % idx return defaults[related_idx] - def remove_name(x): - if isinstance(x, list): return [i for i in x if i != 'name'] + def filter_by_name(x): + names = set(['name', 'out', 'output']) + if isinstance(x, list): return [i for i in x if i not in names] if isinstance(x, dict): - return {k: v for k, v in x.items() if k != 'name'} + return {k: v for k, v in x.items() if k not in names} assert False, "Only support list or dict." def to_defaults_list(params, defaults): @@ -728,7 +729,7 @@ class OpTest(unittest.TestCase): # Because we don't know the python api name of each arguments. # using parse_arg_and_kwargs, we can get the all api information we need. api_params, api_defaults = [ - remove_name(item) for item in parse_arg_and_kwargs(api) + filter_by_name(item) for item in parse_arg_and_kwargs(api) ] api_defaults = to_defaults_list(api_params, api_defaults) inputs_sig, attrs_sig, outputs_sig = kernel_sig @@ -784,10 +785,10 @@ class OpTest(unittest.TestCase): block = fluid.default_main_program().global_block() op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) # prepare input variable - inputs = self.append_input_output_for_dygraph(op_proto, self.inputs, - True, False, block) + eager_tensor_inputs = self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block) # prepare output variable - outputs = self.append_input_output_for_dygraph( + eager_tensor_outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) # prepare attrbutes @@ -798,13 +799,14 @@ class OpTest(unittest.TestCase): attrs_outputs[attrs_name] = self.attrs[attrs_name] kernel_sig = _dygraph_tracer()._get_kernel_signature( - self.op_type, inputs, outputs, attrs_outputs) + self.op_type, eager_tensor_inputs, eager_tensor_outputs, + attrs_outputs) assert hasattr( self, "python_api" ), "Please set the `self.python_api` if you want to compare python api output." - args = prepare_python_api_arguments(self.python_api, inputs, - attrs_outputs, kernel_sig) + args = prepare_python_api_arguments( + self.python_api, eager_tensor_inputs, attrs_outputs, kernel_sig) """ we directly return the cal_python_api value because the value is already tensor. """ return cal_python_api(self.python_api, args, kernel_sig) @@ -1286,11 +1288,11 @@ class OpTest(unittest.TestCase): with _test_eager_guard(): eager_dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) - # we only check end2end api when check_eager=True - if hasattr(self, "python_api"): - api_outs = self._calc_python_api_output(place) - self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, - place) + # we only check end2end api when check_eager=True + if hasattr(self, "python_api"): + api_outs = self._calc_python_api_output(place) + self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, + place) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 50ea065209422d2c972e480fbbd9a9442b5e5c25..6c964a828eed7eb01bce68b81baab61c66c5cf43 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -123,17 +123,26 @@ class XPUOpTest(OpTest): return super().check_grad_with_place( place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, in_place, max_relative_error, - user_defined_grads, user_defined_grads, check_dygraph) + user_defined_grads, user_defined_grad_outputs, check_dygraph) a1 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) a2 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) a3 = self.get_grad_with_place( paddle.CPUPlace(), inputs_to_check, output_names, - no_grad_set=no_grad_set) + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) self._assert_is_close(a1, a2, inputs_to_check, 0.00000001, "Gradient Check On two xpu") self._assert_is_close(a1, a3, inputs_to_check, max_relative_error, @@ -147,7 +156,7 @@ class XPUOpTest(OpTest): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None, + user_defined_grad_outputs=None, check_dygraph=True): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() @@ -197,6 +206,10 @@ class XPUOpTest(OpTest): if not type(output_names) is list: output_names = [output_names] - analytic_grads = self._get_gradient(inputs_to_check, place, - output_names, no_grad_set) + analytic_grads = self._get_gradient( + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) return analytic_grads diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py index 8ff68a1ce0d69307547db2fd1f83526094c9bfcf..91c340c35d478d9576dcc3f1b15d4d2300692c5a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py @@ -19,6 +19,7 @@ import unittest import os import numpy as np import random +import socket import paddle import paddle.nn as nn @@ -31,13 +32,26 @@ from paddle.optimizer import SGD from paddle.fluid.initializer import NumpyArrayInitializer +def net_is_used(port, ip='127.0.0.1'): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((ip, port)) + s.shutdown(2) + return True + except Exception as e: + return False + + def init_process_group(strategy=None): nranks = ParallelEnv().nranks rank = ParallelEnv().local_rank is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks) - group = core.ProcessGroupNCCL(store, rank, nranks) - return group + for port in range(20000, 21000): + if not net_is_used(port): + store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master, + nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group class LinearModel(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..214f41c78a3a5b2c285c7b412241bb59c8ee0a75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import os + +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv +import paddle.fluid.core as core + +paddle.seed(1024) +np.random.seed(2021) + +batch = 5 +in_dim = 10 +out_dim = 20 + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6174, is_master, nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group + + +class SimpleNet(fluid.Layer): + def __init__(self, train_id): + super(SimpleNet, self).__init__() + self.w1 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.w2 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.share_net = Linear(out_dim, 10) + + self.unused_param = self.create_parameter( + shape=[out_dim, in_dim], dtype="float64") + + # just for test sync_params_buffers + # self.register_buffer("queue", paddle.randn([10, 5])) + # self.queue = paddle.nn.functional.normalize(self.queue, axis=0) + # self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) + + self.trainer_id = train_id + + def forward(self, x): + is_use = (paddle.equal_all( + x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and + self.trainer_id == 1) + + if is_use: + tmp = paddle.matmul(x, self.w1) + else: + tmp = paddle.matmul(x, self.w2) + + return self.share_net(tmp) + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + dist.init_parallel_env() + self.trainer_id = dist.get_rank() + + process_group = init_process_group() + self.pg = process_group + with _test_eager_guard(): + + model_a = SimpleNet(self.trainer_id) + model_b = SimpleNet(self.trainer_id) + + state_dict = model_a.state_dict() + model_b.set_state_dict(state_dict) + + model_a = paddle.DataParallel( + model_a, + find_unused_parameters=True, + process_group=process_group) + model_b = paddle.DataParallel( + model_b, + find_unused_parameters=True, + process_group=process_group) + + ones_input = paddle.ones(shape=(batch, in_dim)) + ones_input.stop_gradient = True + + w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + + for step_id in range(5): + print("==============", step_id) + random_input = paddle.rand(shape=(batch, in_dim)) + random_input.stop_gradient = True + + if step_id % 2 == 0: + out_a = model_a(random_input) + out_b = model_b(random_input) + else: + out_a = model_a(ones_input) + out_b = model_b(ones_input) + + out_a.sum().backward() + out_b.sum().backward() + + self.check_gradient(model_a.parameters()) + self.check_gradient(model_b.parameters()) + + # test acc gradient + w1_grad_sum = self.check_acc(model_a._layers.w1.grad, + w1_grad_sum, + model_b._layers.w1.grad) + w2_grad_sum = self.check_acc(model_a._layers.w2.grad, + w2_grad_sum, + model_b._layers.w2.grad) + + model_a.clear_gradients() + + def check_acc(self, grad, grad_sum, acc_grad): + if grad is not None: + grad_sum = grad_sum + grad.numpy() + acc_grad = acc_grad.numpy() if acc_grad is not None else None + np.testing.assert_allclose(grad_sum, acc_grad, rtol=1e-6) + return grad_sum + + def print_trainer_0(self, *args): + if self.trainer_id == 0: + print(*args) + + def broadcast_param(self, param, root): + self.pg.broadcast(param, root) + return param + + def check_gradient(self, params): + other_param = [] + for param in params: + if param.trainable and (param.grad is not None): + grad = param.grad + other_grad = self.broadcast_param(grad, root=1) + if self.trainer_id == 0: + np.testing.assert_allclose(other_grad.numpy(), grad.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py index c62c4615f74707796946137d3b44efc3cc8aeee9..b1f3a71ab3e94c7db53048b95d73795d155bd122 100644 --- a/python/paddle/fluid/tests/unittests/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -47,9 +47,7 @@ class TestProcessGroupFp32(unittest.TestCase): is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks, datetime.timedelta(0)) - gloo_store = paddle.fluid.core.GlooStore(store) - opt = paddle.fluid.core.GlooOptions() - pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks) + pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) # test allreduce sum # rank 0 diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 27aec284de4cdebb5ebb9191bfb67d48c1b327f5..98ef339e04535bb943add02b6cf6efe490f0354b 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -52,7 +52,7 @@ class EagerScaleTestCase(unittest.TestCase): out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) - self.assertTrue(data_eager.grad._is_initialized()) + self.assertIsNotNone(data_eager.grad) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) def test_retain_grad_and_run_backward_raises(self): diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py index 13e763bee630517d1caaa5432f86882c37aab449..43b5ce96a390150db7e29588e4107271b240b23f 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum.py +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -26,14 +26,14 @@ class TestErrors(unittest.TestCase): def test_diagonalize_errors(self): a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') a = paddle.to_tensor(a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('...ii->...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i->i...', a) def test_param_errors(self): @@ -396,6 +396,51 @@ class TestNumpyTests(unittest.TestCase): self.check_output('a...b,b...c,c...a', a, a, a) self.check_output('...ab,...ba,...ab,...ab', a, a, a, a) + def test_static_graph(self): + paddle.enable_static() + fluid = paddle.fluid + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + a = paddle.static.data( + name='a', shape=[3, None, None, None], dtype='float') + b = paddle.static.data( + name='b', shape=[2, None, None, None], dtype='float') + c = paddle.static.data( + name='c', shape=[None, None, 2, None], dtype='float') + d = paddle.static.data( + name='d', shape=[None, None, 5], dtype='float') + e = paddle.static.data( + name='e', shape=[None, 2, None], dtype='float') + + outs = [] + outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b)) + outs.append(paddle.einsum('...ik, ...j', c, d)) + outs.append(paddle.einsum('...kj, ...ik', d, e)) + outs.append(paddle.einsum('ijk..., ikj', c, e)) + outs.append(paddle.einsum('ijk..., ikj->...ij', c, e)) + exe = fluid.Executor(self.place) + exe.run(startup) + a = np.arange(72).reshape(3, 2, 3, 4).astype('float') + b = np.arange(48).reshape(2, 2, 3, 4).astype('float') + c = np.arange(48).reshape(2, 3, 2, 4).astype('float') + d = np.arange(30).reshape(2, 3, 5).astype('float') + e = np.arange(12).reshape(2, 2, 3).astype('float') + feeds = {'a': a, 'b': b, 'c': c, 'd': d, 'e': e} + actual = exe.run(main, feed=feeds, fetch_list=[outs]) + expect = [] + expect.append(np.einsum("ibnd,jbnd->bnij", a, b)) + expect.append(np.einsum('...ik, ...j', c, d)) + expect.append(np.einsum('...kj, ...ik', d, e)) + expect.append(np.einsum('ijk..., ikj', c, e)) + expect.append(np.einsum('ijk..., ikj->...ij', c, e)) + for a, e in zip(actual, expect): + self.check_output_equal(a, e) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d1d391a3949ead28697c0756803e873c41914079..318e826058f2c111f825b113c8ee4676ff87d630 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 00967cb503fe5fd677839a869798964bb5fb0b71..b35b2840ed30a2650e6e19a4cfbc381f50fd5024 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 +23,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index d2bffbe074f2a9bf63975831560597067508aaf5..0ae005430e03b046d609c393fcc0641a0d3db49e 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): set(parameters), set([ 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', - 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', - 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', - 'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', + 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0.asp_mask', + 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0.asp_mask', + 'fc_0.w_0.asp_mask', 'fc_1.b_0_velocity_0', 'fc_2.b_0_velocity_0' ])) self.assertEqual(ops, [ diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index cd4ba5b054264afca65d4c4d8359eb1854fbb658..7436e9eb7b12623296d7a714e742cc4212c4ca91 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101 import unittest from unittest import TestCase import numpy as np +import paddle.compat as cpt +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'): return fluid.dygraph.to_variable(x_np) +class TestEagerGrad(TestCase): + def func_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = fluid.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + + def test_simple_example_eager_grad(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad() + self.func_simple_example_eager_grad() + + def func_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = fluid.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + # x is unused input in the graph + self.assertEqual(dx[1], None) + + def test_simple_example_eager_grad_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_allow_unused() + self.func_simple_example_eager_grad_allow_unused() + + def func_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # allow_unused is false in default + dx = fluid.dygraph.grad(out, [x, z]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_simple_example_eager_grad_not_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_not_allow_unused() + self.func_simple_example_eager_grad_not_allow_unused() + + class TestDygraphDoubleGrad(TestCase): def setUp(self): self.sort_sum_gradient = False @@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = fluid.layers.relu(x) + y2 = fluid.layers.relu(x) + z = y1 + y2 + w = z * z + + w_mean = fluid.layers.reduce_mean(w) + del y1, z, w + + dx_actual, = self.grad( + [w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * + (x_np > 0) * 2).astype('float32') + + self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) + + def test_example_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_no_grad_vars() + self.func_example_no_grad_vars() + + @dygraph_guard + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -214,25 +337,33 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward(retain_graph=True) - - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - - for i in range(5): + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss.backward(retain_graph=True) + x_grad_actual = x.gradient() - x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_grad_expected = (2.0 / float(numel) * ( x_np + dx_expected * (x_np > 0) * 2 / float(numel))).astype('float32') self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() + @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -289,12 +428,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): @@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): class TestDygraphDoubleGradVisitedUniq(TestCase): - def test_compare(self): + def func_compare(self): value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, 5).astype("float32") @@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): self.assertTrue(np.array_equal(grad_1, grad_2)) + def test_compare(self): + with _test_eager_guard(): + self.func_compare() + self.func_compare() + class TestRaiseNoDoubleGradOp(TestCase): def raise_no_grad_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..a434c56200061b656bc2daa0e66069f09b6949cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py @@ -0,0 +1,397 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + + +class TestDygraphInplace(unittest.TestCase): + def setUp(self): + self.init_data() + self.set_np_compare_func() + + def init_data(self): + self.input_var_numpy = np.random.uniform(-5, 5, [10, 20, 1]) + self.dtype = "float32" + + def set_np_compare_func(self): + self.np_compare = np.array_equal + + def non_inplace_api_processing(self, var): + return paddle.squeeze(var) + + def inplace_api_processing(self, var): + return paddle.squeeze_(var) + + def test_inplace_api(self): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + inplace_var = self.inplace_api_processing(var) + self.assertTrue(id(var) == id(inplace_var)) + + inplace_var.exp_() + self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy())) + + def test_forward_version(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + self.assertEqual(var.inplace_version, 0) + + inplace_var = self.inplace_api_processing(var) + self.assertEqual(var.inplace_version, 1) + + inplace_var.exp_() + self.assertEqual(var.inplace_version, 2) + + inplace_var = self.inplace_api_processing(inplace_var) + self.assertEqual(var.inplace_version, 3) + + def test_leaf_inplace_var_error(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + var.stop_gradient = False + + def leaf_inplace_error(): + self.inplace_api_processing(var) + + self.assertRaises(ValueError, leaf_inplace_error) + + def test_backward_error(self): + # It raises an error because the inplace operator will result + # in incorrect gradient computation. + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + # Here, the gradient computation will use the value of var_b + var_c = var_b**2 + self.inplace_api_processing(var_b) + + loss = paddle.nn.functional.relu(var_c) + with self.assertRaisesRegexp( + RuntimeError, + "received current_inplace_version:{} != inplace_version_snapshot_:{}". + format(1, 0)): + loss.backward() + + def test_backward_success_1(self): + # var_b is modified inplace before using it, the inplace operator doesn't result + # in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + # Here, the gradient computation will use the value of var_b + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.non_inplace_api_processing(var_b) + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a)) + + def test_backward_success_2(self): + # Although var_b is modified inplace after using it, it does not used in gradient computation. + # The inplace operator doesn't result in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.non_inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a = var_a.grad.numpy() + self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a)) + + +class TestDygraphInplaceUnsqueeze(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.unsqueeze(var, -1) + + def inplace_api_processing(self, var): + return paddle.unsqueeze_(var, -1) + + +class TestDygraphInplaceReshape(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.reshape(var, [-1]) + + def inplace_api_processing(self, var): + return paddle.reshape_(var, [-1]) + + +class TestDygraphInplaceFlatten(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.flatten() + + def inplace_api_processing(self, var): + return var.flatten_() + + +class TestDygraphInplaceScatter(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]]) + self.dtype = "float32" + + def non_inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter(var, index, updates, overwrite=False) + + def inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter_(var, index, updates, overwrite=False) + + +class TestDygraphInplaceElu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.elu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.elu_(var) + + +class TestDygraphInplaceRelu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.relu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.relu_(var) + + +class TestDygraphInplaceSoftmax(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.softmax(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.softmax_(var) + + +class TestDygraphInplaceTanh(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.tanh(var) + + def inplace_api_processing(self, var): + return paddle.tanh_(var) + + +class TestDygraphInplaceCeil(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.ceil() + + def inplace_api_processing(self, var): + return var.ceil_() + + +class TestDygraphInplaceFloor(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.floor() + + def inplace_api_processing(self, var): + return var.floor_() + + +class TestDygraphInplaceExp(TestDygraphInplace): + def set_np_compare_func(self): + self.np_compare = np.allclose + + def non_inplace_api_processing(self, var): + return var.exp() + + def inplace_api_processing(self, var): + return var.exp_() + + +class TestDygraphInplaceReciprocal(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.reciprocal() + + def inplace_api_processing(self, var): + return var.reciprocal_() + + +class TestDygraphInplaceRound(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.round() + + def inplace_api_processing(self, var): + return var.round_() + + +class TestDygraphInplaceSqrt(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1]) + self.dtype = "float32" + + def non_inplace_api_processing(self, var): + return var.sqrt() + + def inplace_api_processing(self, var): + return var.sqrt_() + + +class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt): + def non_inplace_api_processing(self, var): + return var.rsqrt() + + def inplace_api_processing(self, var): + return var.rsqrt_() + + +class TestDygraphInplaceClip(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.clip(0.6, 1.5) + + def inplace_api_processing(self, var): + return var.clip_(0.6, 1.5) + + +class TestDygraphInplaceScale(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.scale(scale=2.0, bias=3.0) + + def inplace_api_processing(self, var): + return var.scale_(scale=2.0, bias=3.0) + + +class TestDygraphInplaceAdd(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.rand(2, 3, 4) + self.dtype = "float32" + self.input_var_numpy_2 = np.random.rand(2, 3, 4).astype(self.dtype) + + def non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add_(input_var_2) + + +class TestDygraphInplaceSubtract(TestDygraphInplaceAdd): + def non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract_(input_var_2) + + +class TestLossIsInplaceVar(unittest.TestCase): + def test_loss_is_inplace_var(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh_() + + loss.backward() + inplace_grad_var_a = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh() + + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a)) + + +class TestContinuouslyInplace(unittest.TestCase): + def test_continuously_inplace(self): + with _test_eager_guard(): + a = paddle.rand([2, 3]) + a.stop_gradient = False + b = a * 2 + + b.reshape_([-1]) + b.reshape_([2, 3]) + b.reshape_([-1]) + + b.backward() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index ca9a489c7496f33cb084f1cd43158cebc7a1add6..b75dc2c964ca0b22219de1b33cdbfc3d74c19e45 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -215,6 +215,8 @@ class TestLayerNormOp(unittest.TestCase): for name in ['x', 'scale', 'bias', 'y@GRAD'] }, fetch_list=fetch_list) + # print(y) + # print(out[0]) self.__assert_close(y, out[0], "y") self.__assert_close(mean, out[1], "mean") self.__assert_close(variance, out[2], "variance", 1e-3) @@ -238,6 +240,7 @@ class TestLayerNormOp(unittest.TestCase): def test_check_forward_backward_with_scale_and_bias(self): self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) self.check_forward_backward( shape=[2, 3, 4, 5], @@ -432,4 +435,5 @@ class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index d0a40f38ba25721b6f285b48d45d7a3ead37bfee..65d0e289f81329561eaec73d10aa639689f0e1d3 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -542,7 +542,7 @@ class TestComplexMatMulOp(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) def test_check_grad_ingore_x(self): self.check_grad( @@ -560,7 +560,7 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) class TestComplexMatMulOpBroadcast(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py b/python/paddle/fluid/tests/unittests/test_number_count_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0df9d2a3a41b44c18b7e008a271c10544ec4dfa0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import op_test +import numpy as np +import unittest +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.models.moe import utils + + +def count(x, upper_range): + res = np.zeros((upper_range, )).astype(int) + for i in x.reshape(-1): + if i >= 0 and i < len(res): + res[i] += 1 + return res + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountOpInt64(op_test.OpTest): + def setUp(self): + expert_num = 16 + self.op_type = "number_count" + x = np.random.randint(-1, expert_num, size=(1000, 2)).astype('int64') + self.inputs = {'gate_idx': x} + self.outputs = {'Out': count(x, expert_num)} + self.attrs = {"upper_range": expert_num} + + def test_forward(self): + self.check_output_with_place(paddle.CUDAPlace(0)) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountAPI(unittest.TestCase): + def setUp(self): + self.upper_range = 320 + self.x = np.random.randint( + -1, self.upper_range, size=(6000, 200)).astype('int64') + self.out = count(self.x, self.upper_range) + self.place = paddle.CUDAPlace(0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('x', self.x.shape, dtype="int64") + out = utils._number_count(x, self.upper_range) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': self.x}, fetch_list=[out]) + assert np.allclose(res, self.out) + + def test_api_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x) + out = utils._number_count(x, self.upper_range) + assert np.allclose(out.numpy(), self.out) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 2ffe523ef6dda18a24813e702a1892c335ba6a68..531e9663a2b728a2871dff404425b063a0c47e67 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ import unittest from unittest import TestCase import numpy as np import paddle +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -279,12 +312,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 802fcc96288f6114e81f080ef17e527b2e7ad2bd..2530fc07753e8fac56cedff1a6a9798a42059dcb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -205,5 +205,10 @@ class TestDataParallelInEagerMode(TestMultipleGpus): self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') +class TestGradientCheckInEagerMode(TestMultipleGpus): + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 4361a45f1568f5f047ee03090bd3ef28a8d6654f..2380ccb14aaeede7474c99ccbd2d5805849c2118 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import paddle import paddle.fluid.core as core from paddle.static import program_guard, Program +import os def check_randperm_out(n, data_np): @@ -129,5 +130,81 @@ class TestRandpermImperative(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + + x = paddle.randperm(30000, dtype='int32').numpy() + expect = [ + 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='int64').numpy() + expect = [ + 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float32').numpy() + expect = [ + 5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144., + 22906., 10705. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., + 26991. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., + 4662. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float64').numpy() + expect = [ + 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., + 9163. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., + 17624. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., + 21404. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8284771920e81db10d22f08cc96ecc58c422833d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle +from paddle import _C_ops +from paddle.fluid.framework import _test_eager_guard + + +class TestSparseUtils(unittest.TestCase): + def test_to_sparse_coo(self): + with _test_eager_guard(): + x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] + non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] + non_zero_elements = [1, 2, 3, 4, 5] + dense_x = paddle.to_tensor(x) + #TODO(zhangkaihuo): change to test the corresponding API + out = _C_ops.final_state_to_sparse_coo(dense_x, 2) + print(out) + assert np.array_equal(out.non_zero_indices().numpy(), + non_zero_indices) + assert np.array_equal(out.non_zero_elements().numpy(), + non_zero_elements) + + dense_tensor = _C_ops.final_state_to_dense(out) + assert np.array_equal(dense_tensor.numpy(), x) + + def test_to_sparse_csr(self): + with _test_eager_guard(): + x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] + non_zero_crows = [0, 2, 3, 5] + non_zero_cols = [1, 3, 2, 0, 1] + non_zero_elements = [1, 2, 3, 4, 5] + dense_x = paddle.to_tensor(x) + out = _C_ops.final_state_to_sparse_csr(dense_x) + print(out) + assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows) + assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols) + assert np.array_equal(out.non_zero_elements().numpy(), + non_zero_elements) + + dense_tensor = _C_ops.final_state_to_dense(out) + assert np.array_equal(dense_tensor.numpy(), x) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index a3bfe3864a2493fdcf100a1a86648a159701ec11..beaf361379b94dd28997a6186a58608694a20eca 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -333,7 +333,8 @@ class TestVariable(unittest.TestCase): with self.assertRaises(IndexError): res = x[[True, False, False]] with self.assertRaises(ValueError): - res = x[[False, False]] + with paddle.static.program_guard(prog): + res = x[[False, False]] def test_slice(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index 69bca8dd9ef15459021f44fd1b4887e636516ec6..66f2e871dac462c8e6e47357e7367755d2fc0cfc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -849,6 +849,38 @@ def ref_softsign(x): return out +class XPUTestSoftshrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softshrink' + self.use_dynamic_create_class = False + + class XPUTestSoftshrink(TestActivationOPBase): + def set_case(self): + self.op_type = "softshrink" + self.dtype = self.in_type + + threshold = 0.5 + np.random.seed(1023) + x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype) + out = ref_softshrink(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softshrink') +for stype in support_types: + create_test_class(globals(), XPUTestSoftshrinkOP, stype) + + +def ref_softshrink(x, threshold=0.5): + out = np.copy(x) + out = (out < -threshold) * (out + threshold) + (out > threshold) * ( + out - threshold) + return out + + class XPUTestSwishOP(XPUOpTestWrapper): def __init__(self): self.op_name = 'swish' @@ -879,5 +911,36 @@ def ref_swish(x): return out +class XPUTestThresholdedReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'thresholded_relu' + self.use_dynamic_create_class = False + + class XPUTestThresholdedRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "thresholded_relu" + self.dtype = self.in_type + + threshold = 1.0 + np.random.seed(1024) + x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_thresholded_relu(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('thresholded_relu') +for stype in support_types: + create_test_class(globals(), XPUTestThresholdedReluOP, stype) + + +def ref_thresholded_relu(x, threshold=1.0): + out = (x > threshold) * x + return out + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py index 2ad79dd0cca00585b01065e1ae6fbb34da4970d4..9999217041859f43a26b5cb071a2f4942634de2d 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py @@ -21,6 +21,8 @@ import random import sys sys.path.append("../") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types +from xpu.get_test_cover_info import XPUOpTestWrapper paddle.enable_static() np.set_printoptions(threshold=np.inf) @@ -73,188 +75,198 @@ def seqconv(x, return np.dot(col, filter) -class TestSeqProject(XPUOpTest): - def setUp(self): - self.init_test_case() - self.op_type = 'sequence_conv' - self.use_xpu = True - - if self.context_length == 1 \ - and self.context_start == 0 \ - and self.padding_trainable: - print("If context_start is 0 " \ - "and context_length is 1," \ - " padding_trainable should be false.") - return - - # one level, batch size - x = np.random.uniform(-6.10907e-05, 0.000104218, - [self.input_size[0], - self.input_size[1]]).astype('float32') - w = np.random.uniform(-3.17068e-05, 0.000159822, [ - self.context_length * self.input_size[1], self.output_represention - ]).astype('float32') - - begin_pad = np.max([0, -self.context_start]) - end_pad = np.max([0, self.context_start + self.context_length - 1]) - total_pad = begin_pad + end_pad - padding_data = np.random.uniform( - 0, 0, [total_pad, self.input_size[1]]).astype('float32') - self.pad_data = padding_data - self.inputs = { - 'X': (x, self.lod), - 'Filter': w, - } - self.inputs_val = ['X', 'Filter'] - self.inputs_val_no_x = ['Filter'] - self.inputs_val_no_f = ['X'] - - if total_pad != 0: - self.inputs['PaddingData'] = padding_data - self.inputs_val = ['X', 'PaddingData', 'Filter'] - self.inputs_val_no_x = ['PaddingData', 'Filter'] - self.inputs_val_no_f = ['PaddingData', 'X'] - - self.attrs = { - 'contextStart': self.context_start, - 'contextLength': self.context_length, - 'paddingTrainable': self.padding_trainable, - 'contextStride': self.context_stride - } - out = seqconv(x, self.lod, w, self.context_length, self.context_start, - self.padding_trainable, self.pad_data) - self.outputs = {'Out': out} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_input(self): - self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) - - def test_check_grad_padding_data(self): - if self.padding_trainable: +class XPUTestSequenceConv(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sequence_conv' + + class TestSeqProject(XPUOpTest): + def setUp(self): + self.init_test_case() + self.op_type = 'sequence_conv' + self.dtype = self.in_type + self.use_xpu = True + + if self.context_length == 1 \ + and self.context_start == 0 \ + and self.padding_trainable: + print("If context_start is 0 " \ + "and context_length is 1," \ + " padding_trainable should be false.") + return + + # one level, batch size + x = np.random.uniform(-6.10907e-05, 0.000104218, + [self.input_size[0], + self.input_size[1]]).astype(self.dtype) + w = np.random.uniform(-3.17068e-05, 0.000159822, [ + self.context_length * self.input_size[1], + self.output_represention + ]).astype(self.dtype) + + begin_pad = np.max([0, -self.context_start]) + end_pad = np.max([0, self.context_start + self.context_length - 1]) + total_pad = begin_pad + end_pad + padding_data = np.random.uniform( + 0, 0, [total_pad, self.input_size[1]]).astype(self.dtype) + self.pad_data = padding_data + self.inputs = { + 'X': (x, self.lod), + 'Filter': w, + } + self.inputs_val = ['X', 'Filter'] + self.inputs_val_no_x = ['Filter'] + self.inputs_val_no_f = ['X'] + + if total_pad != 0: + self.inputs['PaddingData'] = padding_data + self.inputs_val = ['X', 'PaddingData', 'Filter'] + self.inputs_val_no_x = ['PaddingData', 'Filter'] + self.inputs_val_no_f = ['PaddingData', 'X'] + + self.attrs = { + 'contextStart': self.context_start, + 'contextLength': self.context_length, + 'paddingTrainable': self.padding_trainable, + 'contextStride': self.context_stride + } + out = seqconv(x, self.lod, w, self.context_length, + self.context_start, self.padding_trainable, + self.pad_data) + self.outputs = {'Out': out} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_input(self): + self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) + + def test_check_grad_padding_data(self): + if self.padding_trainable: + self.check_grad( + ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) + + def test_check_grad_Filter(self): self.check_grad( - ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) - - def test_check_grad_Filter(self): - self.check_grad( - ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) - - def test_check_grad_input_filter(self): - if self.padding_trainable: - self.check_grad( - ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) - - def test_check_grad_padding_input(self): - if self.padding_trainable: - self.check_grad( - self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) - - def test_check_grad_padding_filter(self): - if self.padding_trainable: - self.check_grad(self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) - - def init_test_case(self): - self.input_row = 7 - self.input_col = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[0, 1, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase1(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 4, 5, 8, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase2Len0(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase3(TestSeqProject): - def init_test_case(self): - self.input_row = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 25] - idx = list(range(self.input_size[0])) - del idx[0] - offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + - [self.input_size[0]]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase4(TestSeqProject): - def init_test_case(self): - self.input_row = 7835 - self.input_col = 128 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[ - 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, 515, - 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, 1074, 1202, - 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, 1912, 1913, 1914, - 2032, 2066, 2194, 2308, 2309, 2347, 2475, 2476, 2477, 2478, 2606, - 2607, 2735, 2736, 2737, 2738, 2838, 2966, 2967, 2968, 2969, 3097, - 3225, 3353, 3481, 3482, 3520, 3642, 3643, 3754, 3882, 3883, 4010, - 4011, 4012, 4140, 4219, 4228, 4356, 4357, 4415, 4475, 4476, 4604, - 4605, 4606, 4694, 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, - 5312, 5440, 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, - 6021, 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, - 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, 7699, - 7827, 7835 - ]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size + ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) + + def test_check_grad_input_filter(self): + if self.padding_trainable: + self.check_grad( + ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) + + def test_check_grad_padding_input(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) + + def test_check_grad_padding_filter(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) + + def init_test_case(self): + self.input_row = 7 + self.input_col = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[0, 1, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase1(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase2Len0(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase3(TestSeqProject): + def init_test_case(self): + self.input_row = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 25] + idx = list(range(self.input_size[0])) + del idx[0] + offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + + [self.input_size[0]]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase4(TestSeqProject): + def init_test_case(self): + self.input_row = 7835 + self.input_col = 128 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[ + 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, + 515, 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, + 1074, 1202, 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, + 1912, 1913, 1914, 2032, 2066, 2194, 2308, 2309, 2347, 2475, + 2476, 2477, 2478, 2606, 2607, 2735, 2736, 2737, 2738, 2838, + 2966, 2967, 2968, 2969, 3097, 3225, 3353, 3481, 3482, 3520, + 3642, 3643, 3754, 3882, 3883, 4010, 4011, 4012, 4140, 4219, + 4228, 4356, 4357, 4415, 4475, 4476, 4604, 4605, 4606, 4694, + 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, 5312, 5440, + 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, 6021, + 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, + 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, + 7699, 7827, 7835 + ]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + +support_types = get_xpu_op_support_types('sequence_conv') +for stype in support_types: + create_test_class(globals(), XPUTestSequenceConv, stype) class TestSeqConvApi(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py index 8f3578b526e1e5fbfaf2ad27c84bef5134f17d5f..3d7c9959db9ea28ac6f6ecd0050878eee15e6cbd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py @@ -18,169 +18,174 @@ import sys import unittest sys.path.append("..") from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() # Situation 1: starts(list, no tensor), ends(list, no tensor) # 1.1 without attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp(OpTest): - def setUp(self): - self.op_type = "slice" - self.config() - self.inputs = {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': self.infer_flags, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [3, 3, 4] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[1:3, 0:3, 2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase1(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, 2:-1, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase2(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, :, 2:-1] +class XPUTestSliceOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + self.out.shape).astype(self.dtype) + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] # 1.2 with attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim(OpTest): - def setUp(self): - self.op_type = "slice" - self.config() - self.inputs = {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': self.infer_flags, - 'decrease_axis': self.decrease_axis, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [2, 3, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0:3, 2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [2, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1, 0, 2] - self.ends = [1000000, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[-1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 7]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1] - self.ends = [1000000] - self.axes = [3] - self.decrease_axis = [3] - self.infer_flags = [1, 1, 1] - self.out = self.input[:, :, :, -1] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - +class XPUTestSliceOp_decs_dim(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp_decs_dim(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + self.out.shape).astype(self.dtype) + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +support_types = get_xpu_op_support_types('slice') +for stype in support_types: + create_test_class(globals(), XPUTestSliceOp, stype) + create_test_class(globals(), XPUTestSliceOp_decs_dim, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py index d010e1633578ed6f4a237dbda2641b1b563633ee..cd18bd63a88f7c4366470d4d0854f4951e1ba46d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py @@ -24,221 +24,158 @@ import paddle import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard from paddle.fluid import core +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() np.random.seed(10) #Situation 1: repeat_times is a list (without tensor) -class TestTileOpRank1(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")} - self.attrs = {'repeat_times': self.repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -#with dimension expanding -class TestTileOpRank2Expanding(TestTileOpRank1): - def init_data(self): - self.ori_shape = [120] - self.repeat_times = [2, 2] - - -class TestTileOpRank2(TestTileOpRank1): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - - -class TestTileOpRank3_Corner(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (1, 1, 1) - - -class TestTileOpRank3_Corner2(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (2, 2) - - -class TestTileOpRank3(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 15) - self.repeat_times = (2, 1, 4) - - -class TestTileOpRank4(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 5, 7) - self.repeat_times = (3, 2, 1, 2) +class XPUTestTileOpRank1(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype) + } + self.attrs = {'repeat_times': self.repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + #with dimension expanding + class TestTileOpRank2Expanding(TestTileOpRank1): + def init_data(self): + self.ori_shape = [120] + self.repeat_times = [2, 2] + + class TestTileOpRank2(TestTileOpRank1): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + class TestTileOpRank3_Corner(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (1, 1, 1) + + class TestTileOpRank3_Corner2(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (2, 2) + + class TestTileOpRank3(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 15) + self.repeat_times = (2, 1, 4) + + class TestTileOpRank4(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.repeat_times = (3, 2, 1, 2) # Situation 2: repeat_times is a list (with tensor) -class TestTileOpRank1_tensor_attr(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - repeat_times_tensor = [] - for index, ele in enumerate(self.repeat_times): - repeat_times_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'repeat_times_tensor': repeat_times_tensor, - } - self.attrs = {"repeat_times": self.infer_repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - self.infer_repeat_times = [-1] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [1, 1] - self.infer_repeat_times = [1, -1] - - -class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - self.infer_repeat_times = [-1, 3] +class XPUTestTileOpRank1_tensor_attr(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1_tensor_attr(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + repeat_times_tensor = [] + for index, ele in enumerate(self.repeat_times): + repeat_times_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'repeat_times_tensor': repeat_times_tensor, + } + self.attrs = {"repeat_times": self.infer_repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + self.infer_repeat_times = [-1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [1, 1] + self.infer_repeat_times = [1, -1] + + class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + self.infer_repeat_times = [-1, 3] # Situation 3: repeat_times is a tensor -class TestTileOpRank1_tensor(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'RepeatTimes': np.array(self.repeat_times).astype("int32"), - } - self.attrs = {} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_tensor(TestTileOpRank1_tensor): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - - -# Situation 4: input x is Integer -class TestTileOpInteger(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(4, 4, 5)).astype("int32") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 5: input x is Integer -class TestTileOpInt64_t(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("int64") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 6: input x is Bool -class TestTileOpBool(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("bool") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) +class XPUTestTileOpRank1_tensor(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1_tensor(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'RepeatTimes': np.array(self.repeat_times).astype("int32"), + } + self.attrs = {} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestTileOpRank2_tensor(TestTileOpRank1_tensor): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + +support_types = get_xpu_op_support_types('tile') +for stype in support_types: + create_test_class(globals(), XPUTestTileOpRank1, stype) + create_test_class(globals(), XPUTestTileOpRank1_tensor_attr, stype) + create_test_class(globals(), XPUTestTileOpRank1_tensor, stype) # Test python API diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index 9f577d5ff38024fe9264deec2980ff091996a1d8..2d0b079ee9280e2dd0cc0e62c6c5932565ba9dfd 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -29,7 +29,7 @@ def segment_sum(data, segment_ids, name=None): where sum is over j such that `segment_ids[j] == i`. Args: - data (Tensor): A tensor, available data type float32, float64. + data (Tensor): A tensor, available data type float32, float64, int32, int64. segment_ids (Tensor): A 1-D tensor, which have the same size with the first dimension of input data. Available data type is int32, int64. @@ -54,7 +54,8 @@ def segment_sum(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -82,7 +83,7 @@ def segment_mean(data, segment_ids, name=None): of all index 'segment_ids[j] == i'. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -107,7 +108,8 @@ def segment_mean(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -134,7 +136,7 @@ def segment_min(data, segment_ids, name=None): where min is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -159,7 +161,8 @@ def segment_min(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -186,7 +189,7 @@ def segment_max(data, segment_ids, name=None): where max is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -211,7 +214,8 @@ def segment_max(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 5167c18de179dabc4b25bf077d6a81b6ef0b8bf6..6c575b4b997d661d8be79c4e0b457c6a2f34795c 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -59,16 +59,14 @@ class SGD(Optimizer): .. code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32') linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) - back = out.backward() + out.backward() sgd.step() sgd.clear_grad() diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index f06c45cc369737403025ed264815a98b81acc6da..7c0c71951aa1d7a566cabf73ecb9d26e03b8dab6 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -193,7 +193,7 @@ class InputSpec(object): print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) """ - if isinstance(tensor, (Variable, core.VarBase)): + if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)): return cls(tensor.shape, tensor.dtype, name or tensor.name) else: raise ValueError( diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 040480c26faa8fca3e9e08cf0b69cc6cdaaeedfc..06c2a82fd696d8f84f6ce1b38fad95834d10d3a0 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -13,9 +13,10 @@ # limitations under the License. import itertools +import numpy as np import re -from .linalg import matmul, transpose +from .linalg import dot, matmul, transpose from .manipulation import squeeze, unsqueeze, reshape from .math import multiply from .math import sum as paddle_sum @@ -111,36 +112,6 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): f"Invalid equation: duplicate output labels are found.") -# ''' -# Tests if the two operands can perform a broadcast operation on the given ranges of dimensions. -# We follow the Numpy broadcasting convention which states that, by lining up the shape arrays -# starting from the right most dimension, all the aligned dimensions either have equal sizes or -# one of them is sized one. -# Parameters -# ---------- -# args: -# *args unpacks into operand one's axes range, shape, operand two's axes range, shape -# f: -# if available, is used as a callback for postprocessing the aligned operand dimensions. -# ''' -# xran, xshape, yran, yshape = args -# -# xran_inv, yran_inv = xran[::-1], yran[::-1] -# -# for xi, yi in zip(xran_inv, yran_inv): -# xs, ys = xshape[xi], yshape[yi] -# cond = xs == ys or xs == 1 or ys == 1 -# if not cond: -# return False -# -# if not f: -# return True -# -# # Apply the callback to each aligned dimension pair -# for xi, yi in zip(xran_inv, yran_inv): -# f(xi, yi) - - def build_view(in_labels, out_labels): ''' Build an inverse map of dimension indices. Three conditions must hold for @@ -291,39 +262,12 @@ def build_global_shape(g_view, g_labels, op_shapes): g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] - g_masks = [[s > 1 for s in view_shape] for view_shape in view_shapes] + g_masks = [[s > 1 or s == -1 for s in view_shape] + for view_shape in view_shapes] return g_shape, g_masks -def dim_strides(shape): - ''' - Returns the dimension strides for a tensor shape - ''' - strides = [] - stride = 1 - for size in shape[::-1]: - strides.append(stride) - stride = stride * size - return strides - - -def create_view(operand, *view_def): - ''' - Create and materialize a view. - - Parameters - ---------- - operand: - the base tensor operand - view_def: - include two lists which define the view's dimension sizes and strides - ''' - assert False, f'Diagonal and trace not implemented yet.' - view_shape, view_strides = view_def - return operand.create_view(view_shape, view_strides) - - def has_duplicated_labels(labels): ''' Returns True if there is any duplicate label. @@ -337,46 +281,17 @@ def diagonalize(labels, operand): Merges dimensions with duplicate labels. For those dimensions with duplicate labels, merge them into one dimension - which represents the diagonal elements. That requires the duplicate labeled - dimensions equal sized. The order of dimensions is kept unchanged up to - the left-most appearance of each label. + which represents the diagonal elements. This requires the dimensions with + duplicate labels are equal sized. Examples -------- 'ijj...i' would be merged into 'ij...' ''' - if not has_duplicated_labels(labels): - return labels, operand - - strides = dim_strides(operand.shape) - shape = operand.shape - new_labels = [] - new_shape = [] - new_strides = [] - - for ax, l in enumerate(labels): - if l == '.' or l not in new_labels: - # not duplicate - new_labels.append(l) - new_strides.append(strides[ax]) - new_shape.append(shape[ax]) - else: - # duplicate label - diag_ax = new_labels.index(l) - new_strides[diag_ax] += strides[ax] + assert not has_duplicated_labels(labels), ( + f'Duplicate labels are not supported.') - # Call framework API to build a new tensor - new_op = create_view(operand, new_shape, new_strides) - return new_labels, new_op - - -def prod(iter, default=1): - if len(iter): - res = 1 - for s in iter: - res *= s - return res - return default + return labels, operand def plan_reduce(plan, op, reduce_dims, keepdim): @@ -408,102 +323,108 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): op1_view, op2_view = [g_view[op] for op in (op1, op2)] - # Note, I may index into -1 - I1_dims = [op1_view[ax] for ax in I if op1_view[ax] >= 0] - I2_dims = [op2_view[ax] for ax in I if op2_view[ax] >= 0] - J1_dims = [op1_view[ax] for ax in J1] - J2_dims = [op2_view[ax] for ax in J2] - K1_dims = [op1_view[ax] for ax in K] - K2_dims = [op2_view[ax] for ax in K] + I1 = [idx for idx in I if op1_view[idx] >= 0] + I2 = [idx for idx in I if op2_view[idx] >= 0] + op1_view = np.array(op1_view) + op1_dims = op1_view[I1 + J1 + K] - op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] - op1_vshape = [s if m else 1 for s, m in zip(g_shape, op1_mask)] - op2_vshape = [s if m else 1 for s, m in zip(g_shape, op2_mask)] - - I1_shape, J1_shape, K1_shape = [[op1_vshape[ax] for ax in axes] - for axes in (I, J1, K)] - I2_shape, J2_shape, K2_shape = [[op2_vshape[ax] for ax in axes] - for axes in (I, J2, K)] + op2_view = np.array(op2_view) + op2_dims = op2_view[I2 + J2 + K] - K1_size, J1_size, J2_size = prod(K1_shape), prod(J1_shape), prod(J2_shape) + op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] + op1_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op1_mask)]) + op2_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op2_mask)]) + vshape = np.maximum(op1_vshape, op2_vshape) - perm1 = I1_dims + J1_dims + K1_dims - perm2 = I2_dims + J2_dims + K2_dims + i1, i2, j1, j2, k = map(len, (I1, I2, J1, J2, K)) - if any(i != dim for i, dim in enumerate(perm1)): + if any(op1_dims != np.arange(len(op1_dims))): # print(f'perm1: {perm1}') - step = transpose, [var1], var1, perm1 + step = transpose, [var1], var1, list(op1_dims) plan.add_step(step) - if any(i != dim for i, dim in enumerate(perm2)): + if any(op2_dims != np.arange(len(op2_dims))): # print(f'perm2: {perm2}') - step = transpose, [var2], var2, perm2 + step = transpose, [var2], var2, list(op2_dims) plan.add_step(step) - # In case of no K... dimensions, do a broadcast - if not K: - # unsqueeze operands include J1...J2... dimensions - if J2: - fill_start = len(I2_dims) + len(J1) - fill_end = fill_start + len(J2) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var1], var1, fill - plan.add_step(step) - if J1: - fill_start = len(I2_dims) - fill_end = fill_start + len(J1) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var2], var2, fill - plan.add_step(step) - # make broadcast - step = multiply, [var1, var2], var2 - plan.add_step(step) - # K... are there, let's reason about I... and J... - # In case I... and J... are empty, do the vector-vector version of matmul - elif not I and not J1 and not J2: - # merge K dimensions - if len(K) > 1: - for var in var1, var2: - step = reshape, [var], var, [K1_size] - plan.add_step(step) - # Build vector-vector matmul - step = matmul, [var1, var2], var2 - plan.add_step(step) - # General case, there are K... and some I... and J..., the actual operation will be - # matrix-vector or matrix-matrix multiplies, depending on the operands' shapes. - else: - # Merge J dims and K dims by reshaping - merged_shape1 = I1_shape + [J1_size] + [K1_size] - merged_shape2 = I2_shape + [J2_size] + [K1_size] + # Check if conditions hold for turnning the operation into a matmul + if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( + (op1_vshape, op2_vshape)): + op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) + ] + [np.prod(op1_vshape[K])] + op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) + ] + [np.prod(op2_vshape[K])] - step = reshape, [var1], var1, merged_shape1 + # Merge J dims and K dims by reshaping + step = reshape, [var1], var1, op1_shape plan.add_step(step) - step = reshape, [var2], var2, merged_shape2 + step = reshape, [var2], var2, op2_shape plan.add_step(step) # Matmul step = matmul, [var1, var2], var2, False, True plan.add_step(step) - # The result shape is in I..., J1, J2. Let's reshape back to known dimensions - # Note, this is static deduction, not by reading the tensor shape at runtime - result_shape = [1] * len(I) - for i, ax in enumerate(I): - result_shape[i] = max(op1_vshape[ax], op2_vshape[ax]) - if J1: - result_shape += J1_shape - if J2: - result_shape += J2_shape - - # Need a scalar dimension somehow - if result_shape: - step = reshape, [var2], var2, result_shape + # Reshape back + shape = list(vshape[I + J1 + J2]) + step = reshape, [var2], var2, shape plan.add_step(step) + elif j1 == j2 == k == 1: + # Can still do matmul even unknown shapes are present + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + + # In the rest cases we opt for ops other than matmul + else: + # unsqueeze operands include J1...J2... dimensions + if j2: + fill = list(range(i1 + j1, i1 + j1 + j2)) + step = unsqueeze, [var1], var1, fill + plan.add_step(step) + if j1: + fill = list(range(i2, i2 + j1)) + step = unsqueeze, [var2], var2, fill + plan.add_step(step) + # In case of no dimensions to contract, do an elementwise multiply + if k == 0: + # make broadcast + step = multiply, [var1, var2], var2 + plan.add_step(step) + # Contract and no join, turn into a dot + elif j1 + j2 == 0 and k == 1: + step = unsqueeze, [var1], var1, [-2] + plan.add_step(step) + step = unsqueeze, [var2], var2, [-1] + plan.add_step(step) + step = matmul, [var1, var2], var2 + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + elif j1 + j2 == 0 and not-1 in np.concatenate( + (op1_vshape[K], op2_vshape[K])): + assert all(op1_vshape[K] == op2_vshape[K]) + step = reshape, [var1], var1, list(op1_vshape[ + I]) + [1] + [np.prod(op1_vshape[K])] + plan.add_step(step) + step = reshape, [var2], var2, list(op2_vshape[ + I]) + [1] + [np.prod(op2_vshape[K])] + plan.add_step(step) + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + else: + step = multiply, [var1, var2], var2 + plan.add_step(step) + reduce_dims = list(range(-k, 0)) + plan_reduce(plan, op2, reduce_dims, keepdim=False) + # Wrap up, updating auxiliary data # Updating g_mask for I and J axes - for i, ax in enumerate(I + J1 + J2): - op2_mask[ax] = (result_shape[i] > 1) + for ax in I + J1 + J2: + op2_mask[ax] = vshape[ax] > 1 or vshape[ax] == -1 for ax in K: op2_mask[ax] = False @@ -514,6 +435,8 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): for ax in I + J1 + J2: op2_view[ax], dim = dim, dim + 1 + g_view[op2] = list(op2_view) + def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast): @@ -737,7 +660,6 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): return plan -@dygraph_only def einsum(equation, *operands): r""" einsum(equation, *operands) diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 85672ec7a36e698b199669c167488ced17d51837..f164bbc466f18da9b7145533c32369a85d6124df 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -263,14 +263,7 @@ def to_string(var, prefix='Tensor'): data=data) -def tensor_to_string(tensor, prefix='Tensor'): - indent = len(prefix) + 1 - - _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" - - if not tensor._is_initialized(): - return "Tensor(Not initialized)" - +def _format_dense_tensor(tensor, indent): np_tensor = tensor.numpy() if len(tensor.shape) == 0: @@ -288,6 +281,26 @@ def tensor_to_string(tensor, prefix='Tensor'): data = _format_tensor( np_tensor, sumary, indent=indent, max_width=max_width, signed=signed) + return data + + +def sparse_tensor_to_string(tensor, prefix='Tensor'): + indent = len(prefix) + 1 + _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient}, \n{indent}{data})" + if tensor.is_sparse_coo(): + indices_tensor = tensor.non_zero_indices() + elements_tensor = tensor.non_zero_elements() + indices_data = _format_dense_tensor(indices_tensor, indent) + elements_data = _format_dense_tensor(elements_tensor, indent) + data = 'non_zero_indices=' + indices_data + ',\nnon_zero_elements=' + elements_data + else: + crows_tensor = tensor.non_zero_crows() + cols_tensor = tensor.non_zero_cols() + elements_tensor = tensor.non_zero_elements() + crows_data = _format_dense_tensor(crows_tensor, indent) + cols_data = _format_dense_tensor(cols_tensor, indent) + elements_data = _format_dense_tensor(elements_tensor, indent) + data = 'non_zero_crows=' + crows_data + ',\nnon_zero_cols=' + cols_data + ',\nnon_zero_elements=' + elements_data return _template.format( prefix=prefix, @@ -297,3 +310,25 @@ def tensor_to_string(tensor, prefix='Tensor'): stop_gradient=tensor.stop_gradient, indent=' ' * indent, data=data) + + +def tensor_to_string(tensor, prefix='Tensor'): + indent = len(prefix) + 1 + + _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" + + if not tensor._is_initialized(): + return "Tensor(Not initialized)" + + if tensor.is_sparse(): + return sparse_tensor_to_string(tensor, prefix) + else: + data = _format_dense_tensor(tensor, indent) + return _template.format( + prefix=prefix, + shape=tensor.shape, + dtype=tensor.dtype, + place=tensor._place_str, + stop_gradient=tensor.stop_gradient, + indent=' ' * indent, + data=data) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 0d012685b738c63f3eadc0c1d8c44592fe875845..70dea65b7699b413f0dc5fc8d68599229beb3078 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -141,6 +141,14 @@ output : Tensor invoke : full_like(x, 1, dtype, place) +- api : pool2d + args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) + output : Tensor(out) + infer_meta : + func : PoolInferMeta + kernel: + func : pool2d + - api : reshape args : (Tensor x, ScalarArray shape) output : Tensor(out) @@ -177,6 +185,14 @@ kernel : func : sign +- api : softmax + args : (Tensor x, int axis) + output : Tensor + infer_meta : + func : SoftmaxInferMeta + kernel : + func : sotfmax + - api : split args : (Tensor x, ScalarArray num_or_sections, Scalar axis) output : Tensor[] diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index d91b76bb70314a2d516b8a384cf3406b7f9e4d0d..bf3d7b3d19eab806706f1d2d654957aac5b33434 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -698,7 +698,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag) api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" -{code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( +{code_indent} const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} VLOG(6) << "{self.api} API kernel: " << kernel; diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 9c859022e8ad1d910ecf44426b2850496e793cee..2d1fe78b55981c4d0b39848bb81ae4ea8fcc690b 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -4,18 +4,19 @@ kernel : func : sparse_conv3d layout : x + backward : conv3d_grad - api : to_dense - args : (Tensor x, Backend backend) + args : (Tensor x) output : Tensor(out@DenseTensor) - invoke : to_dense_impl(x, backend) + invoke : to_dense_impl(x) - api : to_sparse_coo - args : (Tensor x, Backend backend, int64 sparse_dim) + args : (Tensor x, int64 sparse_dim) output : Tensor(out@SparseCooTensor) - invoke : to_sparse_coo_impl(x, backend, sparse_dim) + invoke : to_sparse_coo_impl(x, sparse_dim) - api : to_sparse_csr - args : (Tensor x, Backend backend) + args : (Tensor x) output : Tensor(out@SparseCsrTensor) - invoke : to_sparse_csr_impl(x, backend) + invoke : to_sparse_csr_impl(x) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index dd22e16dc64f0a12c3575b6f2a0d2c21cd97955b..b4fc7638622b9ec7cfbac12b3d7c831fb6b25ec7 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -192,9 +192,7 @@ def source_include(header_file_path): def api_register(): - return """ -PD_REGISTER_API(Test); -""" + return "" def api_namespace(): diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 561e198a41b99c2362f5e14ac7fcd6da051c8875..5dac7c8c48367b0cb3a880ab8614de5cbec58257 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -115,9 +115,7 @@ def source_include(header_file_path): def api_register(): - return """ -PD_REGISTER_API(Test); -""" + return "" def api_namespace(): diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index 2a9fb842862c2e733376d7eee985b428e497b9f9..5466a1cdd597b0f466d3a0a25def932d6a6be098 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -52,9 +52,10 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then elif [[ "$SYSTEM" == "Windows_NT" ]];then bash $PADDLE_ROOT/win_cmake.sh >prec_build.log 2>&1 fi -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut +# remove line ended with .exe to get correct deleted_ut list +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/br-ut cd $PADDLE_ROOT/build -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/pr-ut cd $PADDLE_ROOT grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut if [[ "$SYSTEM" == 'Linux' ]];then @@ -66,6 +67,8 @@ rm -rf prec_build if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh elif [[ "$SYSTEM" == "Windows_NT" ]];then + # get the deleted ut list in windows, will be used in check_change_of_unittest.sh + grep -F -x -v -f pr-ut br-ut > $PADDLE_ROOT/deleted_ut rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/win_cmake.sh fi git checkout -f $CURBRANCH diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index f754767259563f2cd64bac92adf76249b18af11f..ae0316036f1854e281e07de59fb5aa53201bd35e 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -23,16 +23,6 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); } -def PD_ReturnOp : PD_Op<"return", [Terminator]> { - let summary = "return Op"; - - let description = [{ - Fetch tensor from the graph. - }]; - - let arguments = (ins Variadic:$inputs); -} - def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ @@ -52,6 +42,6 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte let hasFolder = 1; let builders = [ - OpBuilder<(ins "Attribute":$value)>, + OpBuilder<(ins "mlir::Attribute":$value)>, ]; } diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 027dfe4328a55ff246928cbc9ab6d3d36f15e1fd..b0e420da64aa280b71859b27334a2abeaaacc53b 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -16,8 +16,6 @@ import paddle.fluid.framework as framework from paddle.fluid import core from paddle import compat as cpt -ops_having_canonicalization = {"elementwise_add", } - # collect original ops: op which has both inference and grid defination def get_original_ops(): @@ -186,16 +184,31 @@ def generate_all_ops_inputs_outputs_map(op_descs): cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};" # 3. Write to header file - dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h" + dst_head_file = "../../paddle/infrt/dialect/pd/common/pd_ops_info.h" with open(dst_head_file, 'w') as ops_inputs_outputs_head_file: ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str) ops_inputs_outputs_head_file.write("\n\n") ops_inputs_outputs_head_file.write(cpp_style_ops_outputs_map_str) +def get_constraint(op_type, op_proto): + # 2.3.1 inputs + constraint = "NoSideEffect" + + optional_input_num_ = 0 + for input_ in op_proto[INPUTS]: + if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][input_][ + INTERMEDIATE] != True and op_proto[INPUTS][input_][ + DISPENSABLE] == True: + optional_input_num_ += 1 + if optional_input_num_ > 1: + constraint += ", AttrSizedOperandSegments" + return constraint + + # funtion to generate paddle op dialect file def convert_op_proto_into_mlir(op_descs): - dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td" + dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td" custom_dialect_file = "custom_pdop.td" # 1. Head files @@ -214,7 +227,7 @@ def convert_op_proto_into_mlir(op_descs): "include \"mlir/Interfaces/InferTypeOpInterface.td\"", "include \"mlir/Interfaces/LoopLikeInterface.td\"", "include \"mlir/IR/OpBase.td\"", - "include \"paddle/infrt/dialect/pd_op_base.td\"", + "include \"paddle/infrt/dialect/pd/ir/pd_op_base.td\"", "", ] @@ -239,13 +252,14 @@ def convert_op_proto_into_mlir(op_descs): if (op_type in skipped_op_list) or (op_type not in original_ops_): continue automatically_generated_op_dialect.append(op_type) + constraint_ = get_constraint(op_type, op_proto) # 2.1 OpDef - HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [NoSideEffect]> {left_brace}\n'.format( + HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [{constraint}]> {left_brace}\n'.format( op_type_capitalize=op_type.capitalize(), + constraint=constraint_, op_type=op_type, left_brace="{") SUMMARY = ' let summary = "{} op";\n'.format(op_type) - CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else "" # 2.2 Description contents = "" @@ -259,14 +273,22 @@ def convert_op_proto_into_mlir(op_descs): ARGUMENTS = "" if (len(op_proto[INPUTS]) > 0 or len(op_proto[ATTRS]) > 0): ARGUMENTS = " let arguments = (ins " + # 2.3.1 inputs for input_ in op_proto[INPUTS]: if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][ input_][INTERMEDIATE] != True: - if op_proto[INPUTS][input_][DUPLICABLE] != "true": - ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + "," + if op_proto[INPUTS][input_][DISPENSABLE] != True: + if op_proto[INPUTS][input_][DUPLICABLE] != True: + ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + "," + else: + ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," else: - ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," + if op_proto[INPUTS][input_][DUPLICABLE] != True: + ARGUMENTS = ARGUMENTS + " Optional:$" + input_ + "," + else: + ARGUMENTS = ARGUMENTS + " Optional:$" + input_ + "," + # unsupported: BLOCK = 8; BLOCKS = 10; attr_mlir_converter = { 0: 'SI32Attr', @@ -335,7 +357,7 @@ def convert_op_proto_into_mlir(op_descs): for output_ in op_proto[OUTPUTS]: if op_proto[OUTPUTS][output_][EXTRA] != True and op_proto[ OUTPUTS][output_][INTERMEDIATE] != True: - if op_proto[OUTPUTS][output_][DUPLICABLE] != "true": + if op_proto[OUTPUTS][output_][DUPLICABLE] != True: outputs = outputs + "PD_Tensor:${},".format(output_) else: outputs = outputs + "PD_Tensor_Array:${},".format( @@ -348,7 +370,6 @@ def convert_op_proto_into_mlir(op_descs): ops_mlir_file.write(DESCRIPTION) ops_mlir_file.write(ARGUMENTS) ops_mlir_file.write(RESULTS) - ops_mlir_file.write(CANONICALIZATION) ops_mlir_file.write("}\n") print("Skipped ops num: " + str(len(skipped_op_list))) diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 36561d4e71da8b669f1e06b0240a4d4b3b2ca92e..f632c9a9dba504d209946e494e55eb970e727629 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -43,7 +43,8 @@ precision_type_converter = { "float64": "FLOAT64", "complex64": "COMPLEX64", "complex128": "COMPLEX128", - "bool": "BOOL" + "bool": "BOOL", + "Undefined": "UNK" } kernel_types_info_file = "./kernels.json" diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh index 3b9f4b7273500f23d67a3062a2d4ee367c0b473b..6b2586d40819b9e25eef823dff59687114664197 100644 --- a/tools/infrt/get_phi_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -41,7 +41,37 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt -#step 3: merge all infos + +#step 3:get ir's attr_name. +ir_attr_name_info_file=`mktemp` +# phi_cpu attr +all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +for ir in $all_ir_name +do + attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ + gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ + gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ + gsub(/Attr/,"");gsub(/\)/,""); \ + gsub(/[,:]/,"");print $a}'` + echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file +done +# phi_gpu attr +all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +for ir in $all_ir_name +do + attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ + gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ + gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ + gsub(/Attr/,"");gsub(/\)/,""); \ + gsub(/[,:]/,"");print $a}'` + echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file +done + +#step 4: merge all infos # @input1 => phi kernel infomation : kernel_name kernel_key(GPU/CPU, precision, layout) # @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name # @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has @@ -50,4 +80,5 @@ python3 ${PADDLE_ROOT}/tools/infrt/get_phi_kernel_info.py \ --paddle_root_path ${PADDLE_ROOT} \ --kernel_info_file $kernel_register_info_file \ --infermeta_wrap_file ${temp_path}/wrap_info.txt \ + --attr_info_file $ir_attr_name_info_file \ --generate_file ${PADDLE_ROOT}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 774f6cd6bf3648a0de7a34e01e893d212bce9770..85ad585cdefa9cbb4ac8d029e699af4d5ffaeaf7 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -37,6 +37,8 @@ def parse_args(): type=str, required=True, help="inferMeta wrap info file.") + parser.add_argument( + "--attr_info_file", type=str, required=True, help="attr info file.") parser.add_argument( "--generate_file", type=str, @@ -59,6 +61,23 @@ def get_kernel_info(file_path): return [l.strip() for l in cont] +def get_attr_info(file_path): + """ + phi_gpu.argsort.float64.any $axisBool$descending + """ + ret = {} + with open(file_path, 'r') as f: + cont = f.readlines() + for l in cont: + datas = l.strip().split(' ') + if len(datas) == 2: + attrs = datas[1].split('$') + ret[datas[0]] = attrs[1:] + else: + ret[datas[0]] = None + return ret + + def merge(infer_meta_data, kernel_data, wrap_data): meta_map = {} for api in infer_meta_data: @@ -114,14 +133,14 @@ namespace kernel { def gen_context(val): if val == "CPU": - return "phi::CPUContext" - # elif val == "GPU": - # return "phi::GPUContext" + return "phi::CPUContext", "phi_cpu" + elif val == "GPU": + return "phi::GPUContext", "phi_gpu" # elif val == "XPU": - # return "phi::XPUContext" + # return "phi::XPUContext", "phi_xpu" else: # raise Exception(f"Unknown context type {val}") - return "" + return "", "" def gen_layout(val): @@ -195,34 +214,53 @@ def gen_dtype(vals: List[str]): return ir_dtypes, origin_dtypes -# TODO(wilber): Now only process CPUContext. -def gen_register_info(resources: List[List[str]]): +# Note: Now only process CPUContext and GPUContext. + + +def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]): """ - resources: [['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'], ...] + item: ['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'] + attr_data: {'phi_cpu.arg_min.float32.any': ['axisBool', 'keepdimsBool', 'flatten', 'dtype']} """ - res = "void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry) {" - for item in resources: - # The output string is polluted by C++ macros, here the \ is removed - update_item = [v.strip('\\') for v in item] + ctx_name, ir_ctx_name = gen_context(item[1]) + if (ctx_name == ""): + return "" + item[2] = gen_layout(item[2]) + ir_dtypes, origin_dtypes = gen_dtype(item[4:-1]) + infer_shape_func = "&phi::" + item[-1] - ctx_name = gen_context(update_item[1]) - if (ctx_name == ""): - continue - update_item[2] = gen_layout(update_item[2]) - ir_dtypes, origin_dtypes = gen_dtype(update_item[4:-1]) - infer_shape_func = "&phi::" + update_item[-1] + res = "" - if update_item[-1] == "unknown": - # TODO(wilber): handle the unknown inferShape func. - continue + if item[-1] == "unknown": + # TODO(wilber): handle the unknown inferShape func. + return "" + + for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): + kernel_func = gen_kernel_func(item[3], ctx_name, origin_dtype) + ir_name = ir_ctx_name + '.' + item[0].lower( + ) + '.' + ir_dtype + '.' + item[2].lower() + if ir_name in attr_data.keys() and attr_data[ir_name] is not None: + attr_names = ', '.join( + ["\"" + a + "\"" for a in attr_data[ir_name]]) + res += f""" +registry->AddKernelWithAttrs("{ir_name}",""" + + res += f""" + std::bind(&KernelLauncherFunc, + KernelLauncher(), + std::placeholders::_1), + {{{attr_names}}}); +""" - for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): - kernel_func = gen_kernel_func(update_item[3], ctx_name, - origin_dtype) - ir_name = 'phi_cpu.' + update_item[0].lower( - ) + '.' + ir_dtype + '.' + update_item[2].lower() + else: res += f""" - registry->AddKernel("{ir_name}",""" +registry->AddKernel("{ir_name}",""" res += f""" std::bind(&KernelLauncherFunc