diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 41b90345c8c5f38afa413bd2411af975c9d0b103..d3f330ba9dd0fa58b26e9ea05a7154184747daff 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,7 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_GIT_TAG release/v0.1) +set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 9f6fd32ad986c4a5911b1d00dfb548fa3320c34d..5c48afa2806aab10bb08317679c0a00c8f177f7b 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,7 +99,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) add_public_tablegen_target(MLIR${td_base}IncGen) add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index f88c993d85e2fa6eda27b7e845ee27f08347fa83..3fca45cc068f9916b52b3f99df2baa679d4c3546 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,8 +1,9 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api) + if (WITH_DISTRIBUTE) cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) endif() -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 67715f410d443c38a1c5d92c560a35a909c5ec1c..7f21bcee87ab705097d3c2beaf799e5f2d93b833 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -88,8 +88,8 @@ void SyncDefaultStream( for (size_t i = 0; i < places.size(); ++i) { auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); - ncclEvents[i].Record(*dev_ctx[i]); - ncclEvents[i].Block(*default_ctx); + ncclEvents[i].Record(*default_ctx); + ncclEvents[i].Block(*dev_ctx[i]); } } diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 59f3ea3b0a7d85651e7780b4b11875f19b70931e..5533f3f4cbf4b136c52b35cb74afefb86cbe73d7 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -13,7 +13,6 @@ // limitations under the License. 
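Note: the SyncDefaultStream change above swaps the event record/block order so the event is captured on the default (compute) context and the communication context waits on it — the usual record-on-producer / wait-on-consumer pattern. A minimal CUDA-runtime sketch of that ordering (stream names are illustrative, not Paddle's DeviceContext wrapper API):

```cpp
#include <cuda_runtime.h>

// Make comm_stream wait until all work already queued on compute_stream
// has finished, without blocking the host thread.
void WaitForCompute(cudaStream_t compute_stream, cudaStream_t comm_stream) {
  cudaEvent_t ev;
  // No timing needed, so use a lightweight event.
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  // Record on the producer: captures everything enqueued so far on compute_stream.
  cudaEventRecord(ev, compute_stream);
  // Block the consumer: later work on comm_stream waits for the event.
  cudaStreamWaitEvent(comm_stream, ev, 0);
  cudaEventDestroy(ev);
}
```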
#include "paddle/fluid/distributed/collective/reducer.h" -#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { @@ -127,5 +126,430 @@ std::vector> Eager_AssignGroupBySize( return res; } +template +static void ConcatTensorsForAllReduce( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents) { + operators::math::ConcatFunctor concat_functor_; + concat_functor_( + context, dense_tensors_, 0, + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get()); +} + +template +static void SplitTensorsForAllReduce( + const DeviceContext &context, Tensor *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + + operators::math::SplitFunctor split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +// context is used to select the stream for concat +template +static void ConcatTensorsWithType( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents, phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case phi::DataType::FLOAT32: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + case phi::DataType::FLOAT64: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + type)); + } +} + +// context is used to select the stream for split +template +static void SplitTensorsWithType(const DeviceContext &context, + Tensor *p_dense_contents, + std::vector *p_dense_tensors, + phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case phi::DataType::FLOAT32: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + case phi::DataType::FLOAT64: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + type)); + } +} + +void EagerGroup::ConcatTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat grad tensors since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat grad tensor not supported on place (%s)", place)); + } +} + +void EagerGroup::SplitTensors(const platform::Place &place) { + if 
(platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split grad tensor since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split grad tensor not supported on place (%s)", place)); + } +} + +EagerReducer::EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) + : tensors_(tensors), + group_indices_(group_indices), + is_sparse_gradient_(is_sparse_gradient), + process_group_(process_group), + group_size_limits_(group_size_limits), + find_unused_vars_each_step_(find_unused_parameters) { + VLOG(3) << "Start construct the Reducer ..."; + + nranks_ = process_group_->GetSize(); + + // initialize groups + InitializeGroups(group_indices); + + for (size_t global_var_index = 0; global_var_index < tensors_.size(); + ++global_var_index) { + auto tensor = tensors_[global_var_index]; + auto reduce_hook = [=](void) -> void { + this->AddDistHook(global_var_index); + }; + + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + const auto &accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(reduce_hook)); + } + + vars_marked_ready_.resize(tensors_.size(), false); + local_used_vars_.resize(tensors_.size(), 0); +} + +std::shared_ptr EagerReducer::GetGradNodeFromTensor( + Tensor *tensor) { + auto *autograd_meta = tensor->get_autograd_meta(); + const auto &grad_node = + static_cast(autograd_meta)->GetMutableGradNode(); + return grad_node; +} + +void EagerReducer::InitializeGroups( + const std::vector> &group_indices) { + VLOG(3) << "Start initialize groups .."; + + // clear the group + groups_.clear(); + groups_.reserve(group_indices.size()); + + variable_locators_.clear(); + variable_locators_.resize(tensors_.size()); + + auto group_nums = group_indices.size(); + for (size_t group_index = 0; group_index < group_nums; ++group_index) { + const auto &tensor_indices_ = group_indices[group_index]; + PADDLE_ENFORCE_GT( + tensor_indices_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of group[%d]'s elements is 0.", group_index)); + + EagerGroup group; + + // It's just for check the sparse or dense + auto first_var = tensors_[tensor_indices_.front()]; + if (tensor_indices_.size() == 1 && + is_sparse_gradient_[tensor_indices_.front()]) { + // process the sparse gradient. one sparse, one group + group.dtype_ = first_var.dtype(); + } else { + // process the dense gradient. 
+ InitializeDenseGroups(tensor_indices_, &group); + experimental::Backend backend; + switch (inner_place_.GetType()) { + case phi::AllocationType::GPU: + backend = experimental::Backend::GPU; + break; + case phi::AllocationType::CPU: + backend = experimental::Backend::CPU; + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Place type (%s) is not supported. ", inner_place_)); + break; + } + group.dense_contents_ = paddle::experimental::empty( + ScalarArray({group.all_length_}), group.dtype_, backend); + } + + // map tensors to this group by VariableLocator + size_t inside_group_index = 0; + for (const auto var_index : tensor_indices_) { + TensorLocator tensor_locator; + tensor_locator.group_index = group_index; + tensor_locator.inside_group_index = inside_group_index++; + variable_locators_[var_index] = tensor_locator; + } + group.tensor_indices_ = std::move(tensor_indices_); + groups_.emplace_back(std::move(group)); + + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); + } +} + +void EagerReducer::InitializeDenseGroups( + const std::vector &tensor_indices_, EagerGroup *p_group) { + VLOG(3) << "InitializeDenseGroups."; + int64_t all_length = 0; + for (size_t index = 0; index < tensor_indices_.size(); ++index) { + auto tensor_index = tensor_indices_[index]; + auto &tensor = tensors_[tensor_index]; + auto &tensor_name = tensor.name(); + + PADDLE_ENFORCE_EQ(tensor.is_initialized(), true, + platform::errors::PreconditionNotMet( + "Tensor %s is not initialized.", tensor_name)); + const auto size = tensor.numel(); + PADDLE_ENFORCE_GT( + size, 0, platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); + all_length += size; + + p_group->length_.push_back(size); + + // for concat operator + p_group->origin_shapes_.push_back(ScalarArray(tensor.shape())); + p_group->dense_tensors_.push_back(phi::DenseTensor()); + + const auto &dtype = tensor.dtype(); + const auto &place = tensor.place(); + const auto &inner_place = tensor.impl()->place(); + if (index > 0) { + PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, + platform::errors::PreconditionNotMet( + "Tensor %s has unexpected dtype.", tensor_name)); + PADDLE_ENFORCE_EQ(place, place_, + platform::errors::PreconditionNotMet( + "Tensor %s has different place. Expected place is " + "%s, but actual place is %s", + tensor_name, inner_place_, inner_place)); + } else { + p_group->dtype_ = dtype; + place_ = place; + inner_place_ = inner_place; + } + } + p_group->all_length_ = all_length; +} + +void EagerReducer::PrepareForBackward(const std::vector &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + grad_need_hooks_ = true; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) { + group.pending_ = group.tensor_indices_.size(); + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(tensors_.size(), false); +} + +void EagerReducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + + // gradient synchronization is not required when grad_need_hooks_ is false. 
+ if (!grad_need_hooks_) { + return; + } + + auto &tensor = tensors_[var_index]; + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name() + << "] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + MarkVarReady(var_index, true); +} + +void EagerReducer::MarkVarReady(const size_t var_index, + const bool is_used_var) { + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto inside_group_index = var_locator.inside_group_index; + + auto &group = groups_[group_index]; + auto &group_tensor = group.dense_tensors_[inside_group_index]; + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast(autograd_meta)->Grad(); + + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + + vars_marked_ready_[var_index] = true; + + if (--group.pending_ == 0) { + // can start allreduce + MarkGroupReady(group_index); + } +} + +void EagerReducer::MarkGroupReady(size_t group_index) { + VLOG(3) << "Group[" << group_index << "] is ready"; + + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + + if (group_index > next_group_) { + VLOG(3) << "It will adjust the order of group in next batch automatically"; + return; + } + + for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; + ++next_group_) { + UNUSED auto &group = groups_[next_group_]; + FusedAllReduceSchedule(&group, next_group_); + } +} + +void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + + VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce."; + + // concat tensors + group->ConcatTensors(inner_place_); + + // div nranks + double scaling = 1.0 / nranks_; + paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false); + + // all_reduce + std::vector reduce_tensors = {group->dense_contents_}; + tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts)); + + if (tasks_.size() == groups_.size()) { + for (size_t index = 0; index < tasks_.size(); index++) { + auto &task = tasks_.back(); + task->Synchronize(); + tasks_.pop_back(); + } + for (size_t index = 0; index < groups_.size(); index++) { + auto &group = groups_[index]; + group.SplitTensors(inner_place_); + } + } +} + +std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { + const auto &tensors_ = group.tensor_indices_; + out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size() + << "\n"; + auto begin = tensors_.begin(); + auto end = tensors_.end(); + out << "["; + for (int i = 0; begin != end && i < 100; ++i, ++begin) { + if (i > 0) out << ' '; + out << *begin; + } + if (begin != end) { + out << " ..."; + } + out << "]\n"; + return out; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index f8c75385ef8bd6891df8eda6faa93c73091c37f5..ac6f3fbe5956cd47d4385343509d41afec0b69a4 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ 
b/paddle/fluid/distributed/collective/reducer.h @@ -17,16 +17,109 @@ #include #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::ScalarBase; +using ScalarArray = + paddle::experimental::ScalarArrayBase; std::vector> Eager_AssignGroupBySize( - const std::vector, const std::vector& is_sparse_gradient, - const std::vector& group_size_limits, - const std::vector& tensor_indices = {}); + const std::vector, const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices = {}); + +class EagerGroup { + public: + Tensor dense_contents_; + + // for concat kernel + std::vector dense_tensors_; + std::vector length_; + int64_t all_length_{0}; + std::vector origin_shapes_; + + // Global indices of participating tensors in the group + std::vector tensor_indices_; + + // Number of params that haven't been ready. When it is 0, it means + // the group is ready. + size_t pending_ = -1; + + // external message of group + phi::DataType dtype_; + + // context is used to select the stream for concat + void ConcatTensors(const platform::Place &); + + // context is used to select the stream for split + void SplitTensors(const platform::Place &); + + friend std::ostream &operator<<(std::ostream &, const EagerGroup &); +}; + +struct TensorLocator { + // record the index in groups_ + size_t group_index; + size_t inside_group_index; +}; + +class EagerReducer { + public: + explicit EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, + bool find_unused_parameters); + + virtual ~EagerReducer() {} + + std::shared_ptr GetGradNodeFromTensor(Tensor *tensor); + + void InitializeGroups(const std::vector> &group_indices); + void InitializeDenseGroups(const std::vector &tensor_indices_, + EagerGroup *p_group); + void PrepareForBackward(const std::vector &outputs); + void AddDistHook(size_t var_index); + void MarkVarReady(const size_t var_index, const bool is_used_var); + void MarkGroupReady(const size_t group_index); + void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + + private: + std::vector tensors_; + std::vector> group_indices_; + std::vector is_sparse_gradient_; + std::shared_ptr process_group_; + std::vector group_size_limits_; + bool find_unused_vars_each_step_; + + std::vector groups_; + std::vector variable_locators_; + PlaceType place_; + platform::Place inner_place_; + size_t next_group_ = 0; + int64_t nranks_ = -1; + std::vector> tasks_; + + bool grad_need_hooks_{false}; + + std::vector vars_marked_ready_; + std::vector local_used_vars_; +}; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h 
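Note: the reducer header above declares Eager_AssignGroupBySize, which buckets gradient tensors into fused-allreduce groups by accumulated size. A simplified, self-contained sketch of that bucketing idea — it deliberately ignores the sparse-gradient and per-dtype handling of the real implementation, and the byte sizes/limit are illustrative:

```cpp
#include <cstddef>
#include <vector>

// Greedy size-based bucketing: close a group once its accumulated size
// reaches the limit. Indices keep the original tensor order.
std::vector<std::vector<size_t>> AssignGroupsBySize(
    const std::vector<size_t>& tensor_bytes, size_t group_size_limit) {
  std::vector<std::vector<size_t>> groups;
  std::vector<size_t> current;
  size_t current_bytes = 0;
  for (size_t i = 0; i < tensor_bytes.size(); ++i) {
    current.push_back(i);
    current_bytes += tensor_bytes[i];
    if (current_bytes >= group_size_limit) {
      groups.push_back(std::move(current));
      current.clear();
      current_bytes = 0;
    }
  }
  if (!current.empty()) groups.push_back(std::move(current));
  return groups;
}
```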
index f46e659a88babb07918d02f1e05859829895f2bf..5ac0c08f97d76f6bc1cb77f1f6cd0da77be2385f 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -23,7 +23,6 @@ #include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" - #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { @@ -118,9 +117,13 @@ class TruncatedGaussianInitializer : public Initializer { seed_ = static_cast(std::stoi(attrs[1])); mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - - std::uniform_real_distribution dist_( - std::numeric_limits::min(), 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_); + float b_normal_cdf = normal_cdf((2.0 - mean_) / std_); + std::uniform_real_distribution dist_(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); random_engine_ = framework::GetCPURandomEngine(seed_); } diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index f9d1b705390cb1c22bf9336292af30363c0010cf..691a381405e9a792d1ee0f256647405a3739e9d8 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,5 @@ -set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node) + set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) @@ -9,6 +10,8 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) +add_subdirectory(custom_operator) + cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 00578d9a359a3b8d57148efc959de553e811f541..a9a62fcd50e7a0648e695d1f60d52d3f936c53ed 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/imperative/tracer.h" - +#include "paddle/phi/api/ext/op_meta_info.h" namespace egr { class UniqueNameGenerator { @@ -70,6 +70,21 @@ class Controller { void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; } + const std::unordered_map>& + GetOpMetaInfoMap() { + return op_meta_info_map_; + } + + void MergeOpMetaInfoMap(const std::unordered_map< + std::string, std::vector>& map) { + op_meta_info_map_.insert(map.begin(), map.end()); + } + + std::unordered_map>>& + GetCustomEdgesSlotMap() { + return custom_edges_slot_map_; + } + private: Controller() = default; static Controller* controller_; @@ -77,6 +92,11 @@ class Controller { new paddle::imperative::Tracer()}; // TODO(jiabin): remove when we don't need imperative. 
bool in_eager_mode_{false}; + std::unordered_map> + op_meta_info_map_; + /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ + std::unordered_map>> + custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index dc79a8a45a246798551a0bcce8c487f67183220b..6a2e5e7ac6cd75068bba4e9b675ab67588c38366 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1553,9 +1553,23 @@ static std::pair GenerateForwardFunctionContents( core_ops_returns_info[op_type] = return_contents; // [Generation] ComputeRequireGrad -> GradNodeCreation + if (!bwd_info.GenerateForwardOnly()) { std::string grad_node_creation_body_str = GenerateGradNodeCreationContent(fwd_info, bwd_info); + + // Add event record + std::string event_name = op_type + " node_creation"; + const char* NODE_CREATION_TEMPLATE = + "{\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + " %s\n" + "}"; + + grad_node_creation_body_str = paddle::string::Sprintf( + NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str); + generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; @@ -1614,10 +1628,20 @@ static std::pair GenerateForwardFunctionContents( if ((*iter) == ',') dygraph_function_args_str.erase(iter); } - const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n"; + const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE = + "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);"; + std::string event_name = op_type + " dygraph"; + std::string fwd_record_event_str = paddle::string::Sprintf( + DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name); + const char* FWD_FUNCTION_TEMPLATE = + "%s %s(%s) {\n\n" + " %s\n" + " %s\n" + "}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, - dygraph_function_args_str, generated_function_body); + dygraph_function_args_str, fwd_record_event_str, generated_function_body); // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; @@ -2240,8 +2264,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path, "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" - "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"; + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n" + "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" + "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n"; std::string forward_cc_include_str = paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 967891fe5227dcd6129c0ef1808fba7720711568..656418a05ad6d04bc19838c97d86db9cda19c1c6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -148,6 +148,12 @@ def 
ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def RemoveSpecialSymbolsInName(string): + # Remove any name after '@' + ret = string.split("@")[0] + return ret + + def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): # intermediate_outputs : [name0, name1, ...] # forward_returns_list : [[ret_name, type, orig_pos], ...] @@ -166,15 +172,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): def ParseDispensable(string): # string: "X, Y" + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseIntermediate(string): + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseNoNeedBuffer(string): # string: "x, y" + string = RemoveSpecialSymbolsInName(string) + no_need_buffer_set = set() for name in string.split(","): no_need_buffer_set.add(name.strip()) @@ -204,6 +214,8 @@ def ParseYamlArgs(string): assert arg_type in yaml_types_mapping.keys() arg_type = yaml_types_mapping[arg_type] + + arg_name = RemoveSpecialSymbolsInName(arg_name) if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -239,6 +251,7 @@ def ParseYamlReturns(string): ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type + ret_name = RemoveSpecialSymbolsInName(ret_name) returns_list.append([ret_name, ret_type, i]) return returns_list @@ -910,8 +923,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs) + node_event_name = fwd_api_name + " node_creation" + NODE_CREATION_TEMPLATE = """{{\n + paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n + {}\n + }}""" + node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name, + node_creation_str) + + dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" + FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ + {} + // Forward API Call {} @@ -925,7 +950,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_function_name = GetForwardFunctionName(fwd_api_name) forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, - forward_call_str, node_creation_str, returns_str) + dygraph_event_str, forward_call_str, node_creation_str, returns_str) forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" return forward_function_str, forward_function_declaration_str @@ -1052,6 +1077,8 @@ def GenerateForwardCCFile(filepath, forward_definition_str): #include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + """ file_contents += GenerateCoreOpInfoDefinition() diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index eee32a2c5057d523212a4faa5eca8678e961f417..9b77f0449e01d6555cd3a25f101e4867ccc6ffd3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -94,9 +94,13 @@ def GeneratePythonCFunction(fwd_api_name, 
forward_inputs_position_map, dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) + pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);" + PYTHON_C_FUNCTION_TEMPLATE = """ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ + {} + PyThreadState *tstate = nullptr; try {{ @@ -136,8 +140,8 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - fwd_function_name, dygraph_function_call_str) + fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str, + parse_attributes_str, fwd_function_name, dygraph_function_call_str) python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" @@ -231,6 +235,7 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include namespace paddle {{ diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 934497d7d179c1732bde68c147ed86661c25ddae..1987d024d8f3e34121f54962c45f0f8c1e91b723 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -77,6 +79,9 @@ std::unordered_map getInDegreeMap( void RunBackward(const std::vector& tensors, const std::vector& grad_tensors, bool retain_graph) { + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level @@ -112,7 +117,8 @@ void RunBackward(const std::vector& tensors, // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { - VLOG(6) << "Create Value for grad input tensor " << i; + VLOG(6) << "Create Value for grad input tensor " << i + << " of grad node: " << grad_node->name(); node_input_buffers_dict[grad_node] = std::make_unique(grad_node->InputMeta()); } @@ -158,19 +164,27 @@ void RunBackward(const std::vector& tensors, VLOG(6) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); - queue.pop(); + paddle::platform::RecordEvent node_record_event( + std::string(typeid(*node).name()) + " grad_node", + paddle::platform::TracerEventType::Operator, 1); + + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop(); + continue; + } + queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), paddle::platform::errors::Fatal( - "Unable to find next node in the 
InputBuufer" + "Unable to find next node in the GradTensorHolder \n" "Trying to run Node without configuring its GradTensorHolder")); std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); - VLOG(6) << "Run Backward Kernel with input_buffer"; + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = (*node)(node_input_buffer->Buffers()); @@ -215,9 +229,8 @@ void RunBackward(const std::vector& tensors, if ((!grad_output_tensor.defined() || !grad_output_tensor.initialized())) { - VLOG(6) - << "We get grad_output_tensor with slot: " << i << ", rank: " << j - << " as uninitialized or undefined in both tensor and variable"; + VLOG(6) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j << " as uninitialized or undefined tensor"; } VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j @@ -228,6 +241,8 @@ void RunBackward(const std::vector& tensors, const auto& input_meta = next_node->InputMeta(); auto grad_tensor_holder = std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first @@ -237,10 +252,12 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; - PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0, - paddle::platform::errors::Fatal( - "Detected in-degree value smaller than zero." - "Node's in-degree cannot be negative")); + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + paddle::platform::errors::Fatal( + "Detected in-degree value smaller than zero. For Node: %s" + "Node's in-degree cannot be negative", + next_node->name())); if (node_in_degree_map[next_node] == 0) { queue.emplace(std::move(next_node)); } diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccc9a03a55660772b51dc27bbfa78b7531a369d3 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc new file mode 100644 index 0000000000000000000000000000000000000000..48ac8c8358afd68cee9d22b8ea0a4e8fd7c3c92e --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
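Note: the backward.cc changes above refine the queue handling in RunBackward, which walks the autograd graph with an in-degree map so a node runs only after all of its incoming gradients have arrived. A stripped-down sketch of that scheduling idea over a generic DAG (the node type and edge accessors are illustrative, not the eager GradNodeBase API):

```cpp
#include <queue>
#include <unordered_map>
#include <vector>

struct Node {
  std::vector<Node*> next;  // nodes that consume this node's output grads
};

// Kahn-style traversal: a node becomes ready once every producer feeding
// it has already run.
void RunInTopologicalOrder(const std::vector<Node*>& starts,
                           std::unordered_map<Node*, int>* in_degree) {
  std::queue<Node*> ready;
  for (auto* n : starts) ready.push(n);
  while (!ready.empty()) {
    Node* node = ready.front();
    ready.pop();
    // ... run this node's backward computation here ...
    for (auto* next : node->next) {
      if (--(*in_degree)[next] == 0) {
        ready.push(next);
      }
    }
  }
}
```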
+ +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace egr { +std::vector> RunCustomOpNode:: +operator()( + const std::vector>& grads) { + paddle::CustomOpKernelContext ctx; + auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + std::vector> tmp_ins( + grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + for (size_t i = 0; i < grads.size(); i++) { + if (map[1].find(i) != map[1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; + tmp_ins[map[1][i]] = grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + std::vector> outs( + GetEdges().size()); + std::vector> tmp_outs( + grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + for (size_t i = 0; i < GetEdges().size(); i++) { + if (map[0].find(i) != map[0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << GetEdges()[i].size() + << " to tmp_outputs: " << map[0][i]; + for (size_t j = 0; j < GetEdges()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[1]))(&ctx); + return outs; +} +} // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h new file mode 100644 index 0000000000000000000000000000000000000000..e5ddef9c062149282d790a5fd6bf31b25a20cf5a --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -0,0 +1,77 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/utils/any.h" + +namespace egr { +class RunCustomOpNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpNode for op: " << op_type; + } + + ~RunCustomOpNode() override { + VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector>& grads) + override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover(nullptr)); + } + return res; + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + +} // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 427be83c3bbee31eaa0c7e3d26d2d9599b344450..7eb2902d935c4fd8d5990c81fbf6bcf3fd6e6e66 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -25,7 +25,7 @@ #include "glog/logging.h" /** - * Implementation of GradNodeBase, Edge and InputBuffer. + * Implementation of GradNodeBase, Edge and GradTensorHolder. 
**/ namespace egr { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index adb3246ee8c808c9f62fde0228f40cccb2f9ac88..056c7102f663b93d215e494908d9c95be832068c 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -40,6 +40,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index bd70e84d9b461490f53ac6692d55860da1bfc9d8..5e790389819f53b250db8797c7a8b3466818abfb 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -44,6 +44,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index a9d297c1c64f7b64373237a0500802a5c883aedd..b4b47a85f66662347d5e087cd4391979fb6c4250 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -41,6 +41,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index bd9eaa09ca9a406da943c8a0b0f37b674d5ea3c2..a3e393b039425e506066b485bc8a8688bff20d96 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -43,6 +43,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 6f8bccd64e45f015a5c1aed44fbfdfc6f68660f1..9967d8c36900f45fdd76272bc4416df1d30f2a6a 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -57,6 +57,7 @@ inline void run_program_dygraph_function( auto grad_node = std::make_shared(1, 2); grad_node->SetFwdOutNames(out_names); + grad_node->SetOut(out); // Set Attributes grad_node->SetAttrMap(attrs); // Set TensorWrappers diff --git 
a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index ae5d86664a346fd8a1d877f9e1dd74f687302595..d99624e49324853d513a20a725c1a3d12b6aaab5 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -260,9 +260,9 @@ inline void RunProgramAPI( } VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); - // #ifdef PADDLE_WITH_MKLDNN - // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); - // #endif +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif } inline void RunProgramGradAPI( @@ -357,7 +357,7 @@ inline void RunProgramGradAPI( details::ShareTensorsFromScope(params_grad, *global_block, &scope); // Step5. drop current scope - // global_inner_scope->DeleteScope(&scope); + global_inner_scope->DeleteScope(&scope); VLOG(2) << "The number of sub scopes after backward: " << global_inner_scope->kids().size(); } @@ -400,6 +400,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { paddle::platform::errors::InvalidArgument( "The grads[0].size() and fwd_out_names_.size() should be equal.")); for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad(); + const_cast(out_grad).set_impl( + grads[0][i].impl()); + const_cast(grads[0][i]) .set_name(fwd_out_names_[i] + "@GRAD"); } @@ -432,6 +436,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { fwd_out_names_ = out_names; } + void SetOut(const std::vector &out) { + out_ = out; + } + protected: void ConstructGradTensors( const std::vector &fwd_tensors, @@ -440,7 +448,11 @@ class GradNodeRunProgram : public egr::GradNodeBase { // such as: name, tensor type(DenseTensor or SelectedRows). 
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); for (auto &fwd_t : fwd_tensors) { - grad_tensors->emplace_back(fwd_t.impl()); + if (phi::DenseTensor::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } else if (phi::SelectedRows::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } auto &grad_t = grad_tensors->back(); grad_t.set_name(fwd_t.name() + "@GRAD"); } @@ -462,6 +474,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { std::vector step_scope_; std::vector fwd_out_names_; + std::vector out_; // Attribute Map paddle::framework::AttributeMap attrs_; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index aa92a3b2226c1fca1fa7326e76ef29b0b38cd8d6..5dc3d9e89c557e86f5af821446b82ad691ad5c95 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -440,6 +440,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) + #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b9e3bee25f6b5377dde7b525138643964fd8366a..478e39b99dcc9935306603a48810d46ba792d3c3 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_meta_info_helper.h" @@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap( ////////////////////// User APIs /////////////////////// // load op api -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); auto& op_meta_info_map = get_op_meta_info_map(); - RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle); + return op_meta_info_map.GetMap(); } } // namespace framework diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 4310b564371822d0238a55b9091f524d8d419966..fef1e82a14fe6e03de40c8376f922f87f64564f8 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace framework { - // Load custom op api: register op after user compiled -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); // Register custom op api: register op directly void RegisterOperatorWithMetaInfoMap( @@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap( // Interface for selective register custom op. 
void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, void* dso_handle = nullptr); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 29c7f5d0ce73cbf1af18e6f5869d59d2200917ad..b1d7059f311cd370a40e83d7b0016d5af8cdb163 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -249,13 +249,13 @@ class CompatMetaTensor : public phi::MetaTensor { } void share_meta(const MetaTensor& meta_tensor) override { + share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); // VarDesc doesn't contains layout, so we cannot share layout // set_layout(meta_tensor.layout()); - // special case 1: share lod of LoDTensor + // special case: share lod of LoDTensor share_lod(meta_tensor); - share_dims(meta_tensor); } private: @@ -297,7 +297,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. build infermeta context - phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); + phi::InferMetaContext infer_meta_context( + {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); @@ -499,8 +500,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } - } else { - // do nothing + } else if (ctx->HasInput(attr_name)) { + // convert from data + if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { + if (ctx->IsRuntime()) { + const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); + auto val = experimental::MakePhiScalarFromVar(*var_temp); + int32_t val_int = val.template to(); + infer_meta_context.EmplaceBackAttr(val_int); + } else { + infer_meta_context.EmplaceBackAttr(-1); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Get value from variable only support int yet")); + } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a1f2d6edca6a2db5d5bb4c8cf896c492f20ed2da..623c8a048c2417ab51772c55b681031d9bcfd925 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -126,6 +126,7 @@ if(WITH_MKLDNN) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d7d866fa98bb5895e4f3175e227f7b3c2ce869b6..18068e22b7f3c31d59636bc7ab6a234e109d5ee6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()( return activation_out_var; } +PDNode *patterns::ElementwiseActivation::operator()( + paddle::framework::ir::PDNode *elementwise_a, + const std::string &elementwise_type, const std::string &activation_type) { + // Create Operators + 
elementwise_a->assert_is_op_input(elementwise_type, "X"); + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); + auto *activation_op = + pattern->NewNode(activation_repr())->assert_is_op(activation_type); + // Create variables + auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) + ->AsInput() + ->assert_is_op_input(elementwise_type, "Y"); + // intermediate variable, will be removed in the IR after fuse. + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(elementwise_type) + ->assert_is_op_input(activation_type); + // output + auto *activation_out_var = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + + elementwise_op->LinksFrom({elementwise_a, elementwise_b}) + .LinksTo({elementwise_out_var}); + activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); + return activation_out_var; +} + PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0f21906d08d0e4fc8a54472ab40ceb08df9d1949..062d2f9dedce65f6e16b70f0b201a4ca63b0531a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase { PATTERN_DECL_NODE(activation_out); }; +// Elementwise with Activation +// op: elementwise + activation +// named nodes: +// elementwise_a, elementwise_b, +// elementwise_out, elementwise, +// activation_out, activation +struct ElementwiseActivation : public PatternBase { + ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add_activation") {} + + PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type, + const std::string& activation_type); + + // declare operator node's name + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(activation); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_b); + PATTERN_DECL_NODE(elementwise_out); + PATTERN_DECL_NODE(activation_out); +}; + // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7f7a8071d21413f45d86e98b8649a3aaba5d2f5 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
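Note: the new pass introduced above folds an activation into the preceding oneDNN elementwise op as a post-op, so the intermediate elementwise result never has to be written out and re-read. A minimal scalar sketch of the effect of that fusion (plain C++, not the oneDNN primitive API):

```cpp
#include <algorithm>
#include <vector>

// Unfused: two passes over memory, with a temporary buffer in between.
void AddThenRelu(const std::vector<float>& x, const std::vector<float>& y,
                 std::vector<float>* out) {
  std::vector<float> tmp(x.size());
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) tmp[i] = x[i] + y[i];
  for (size_t i = 0; i < x.size(); ++i) (*out)[i] = std::max(tmp[i], 0.0f);
}

// Fused: the activation is applied as a post-op in the same pass, which is
// what the elementwise+activation graph fusion enables at kernel level.
void AddWithReluPostOp(const std::vector<float>& x, const std::vector<float>& y,
                       std::vector<float>* out) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) (*out)[i] = std::max(x[i] + y[i], 0.0f);
}
```

At graph level the pass removes the standalone activation node and records it on the elementwise op via the activation_type / activation_alpha / activation_beta attributes, as the handler below shows.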
+ +#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { + std::vector act_types = { + "relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt", + "abs", "clip", "gelu", "relu6", "sigmoid"}; + std::vector elt_types = {"elementwise_add", "elementwise_sub", + "elementwise_mul"}; + + for (const auto &elt_type : elt_types) + for (const auto &act_type : act_types) { + std::unordered_map attr_map; + + if (act_type == "swish") + attr_map.emplace("beta", "activation_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "activation_alpha"); + else if (act_type == "clip") { + attr_map.emplace("min", "activation_alpha"); + attr_map.emplace("max", "activation_beta"); + } else { + attr_map.emplace("alpha", "activation_alpha"); + attr_map.emplace("beta", "activation_beta"); + } + FuseElementwiseAct(graph, elt_type, act_type, attr_map); + } +} + +void ElementwiseActivationOneDNNPass::FuseElementwiseAct( + Graph *graph, const std::string &elt_type, const std::string &act_type, + const std::unordered_map &attr_map) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("elementwise_act", graph); + + GraphPatternDetector gpd; + auto *elementwise_input = gpd.mutable_pattern() + ->NewNode(elt_type + "_act/elementwise_input") + ->AsInput() + ->assert_is_op_input(elt_type, "X"); + patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elementwise_input, elt_type, act_type); + + int found_elementwise_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Fuse " << elt_type << " with activation op."; + // Elementwise output + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, + elementwise_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, + elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + + auto *elementwise_op = elementwise->Op(); + + if (elementwise_op->HasAttr("use_mkldnn")) { + const std::string wo_elt_type = + "The " + elt_type; // Workaround for PP error message checking. 
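// [Editor's illustrative sketch -- not part of this patch.] The attr_map built in
// ApplyImpl above renames each activation's framework attributes onto the fused
// op's generic "activation_alpha"/"activation_beta" slots (swish: beta, relu6:
// threshold, clip: min/max, everything else: alpha/beta). A standalone rendering
// of that mapping, runnable without Paddle; the function name is illustrative:

#include <cstdio>
#include <string>
#include <unordered_map>

static std::unordered_map<std::string, std::string> ActAttrMap(
    const std::string& act_type) {
  std::unordered_map<std::string, std::string> attr_map;
  if (act_type == "swish") {
    attr_map.emplace("beta", "activation_alpha");
  } else if (act_type == "relu6") {
    attr_map.emplace("threshold", "activation_alpha");
  } else if (act_type == "clip") {
    attr_map.emplace("min", "activation_alpha");
    attr_map.emplace("max", "activation_beta");
  } else {
    attr_map.emplace("alpha", "activation_alpha");
    attr_map.emplace("beta", "activation_beta");
  }
  return attr_map;
}

int main() {
  for (const auto& kv : ActAttrMap("clip")) {
    std::printf("clip.%s -> fused.%s\n", kv.first.c_str(), kv.second.c_str());
  }
  return 0;
}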
+ PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true, + platform::errors::PreconditionNotMet( + wo_elt_type + "+Act fusion may happen only when oneDNN library " + "is used.")); + } + + auto *activation_op = activation->Op(); + for (const auto &attr : attr_map) { + if (activation_op->HasAttr(attr.first)) { + elementwise_op->SetAttr(attr.second, + activation_op->GetAttr(attr.first)); + } + } + + if (act_type == "gelu" && activation_op->HasAttr("approximate") && + BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) + elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + else + elementwise_op->SetAttr("activation_type", act_type); + + elementwise_op->SetOutput("Out", {activation_out->Name()}); + + IR_OP_VAR_LINK(elementwise, activation_out); + GraphSafeRemoveNodes(g, {activation, elementwise_out}); + found_elementwise_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_elementwise_activation_count); + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, elt_type, act_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(elt_act_mkldnn_fuse_pass, + paddle::framework::ir::ElementwiseActivationOneDNNPass); +REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .LE("elementwise_sub", 1) + .LE("elementwise_mul", 1) + .LE("relu", 0) + .LE("tanh", 0) + .LE("leaky_relu", 1) + .LE("swish", 0) + .LE("hard_swish", 0) + .LE("sqrt", 0) + .LE("abs", 0) + .LE("clip", 1) + .LE("gelu", 0) + .LE("relu6", 0) + .LE("sigmoid", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b8b7d06a828508e9773301bfc602e01f9354eac4 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * \brief Fuse the Elementwise and activation operators into single + * OneDNN's Elementwise with post-op. 
+ */ +class ElementwiseActivationOneDNNPass : public FusePassBase { + public: + virtual ~ElementwiseActivationOneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseElementwiseAct( + Graph *graph, const std::string &elt_types, const std::string &act_types, + const std::unordered_map &attr_map) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index d578ada0db00fed85f7b4f25f1483169c72c2c0b..ef2e83ced26e07f199a122ee3157eb428b63aec9 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -25,11 +25,11 @@ USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); USE_OP_ITSELF(relu); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); namespace paddle { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 219aae71127ed8963b4bfe4e8ee5e7259dbf7d02..eadb00b9e88e14075c46a53c711fd43774f26581 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -32,7 +32,7 @@ USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); @@ -48,7 +48,7 @@ USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); -USE_OP(tanh_grad); +USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); USE_OP(lookup_table_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f8e30c1ee294ecf692e2992b6123232ba1c8bd7d..f23a266ef03641bc8f8d273b15ab4982e377cb03 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2250,41 +2250,62 @@ void OperatorWithKernel::BuildPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later - auto& attr = Attrs().at(attr_names[i]); + auto attr_it = attrs_.find(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + if (attr_it == attrs_.end()) { + auto in_it = ctx.inputs.find(attr_names[i]); + if (in_it != ctx.inputs.end()) { + // get data from input + auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0])); + int32_t val_int = val.template to(); + pt_kernel_context->EmplaceBackAttr(val_int); + } else { + PADDLE_THROW(platform::errors::NotFound( + "can not find attribute `%s` both in attribute and input ", + attr_names[i])); + } + } else { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_it->second)); + } } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr( + 
BOOST_GET_CONST(bool, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { auto data_type = paddle::framework::TransToPhiDataType( static_cast( - BOOST_GET_CONST(int, attr))); + BOOST_GET_CONST(int, attr_it->second))); pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == + if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + BOOST_GET_CONST(std::vector, attr_it->second)); + } else if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index 23cb653fef22ac966655e5650d20c128e2bd3cdd..7a7a7b2798f5920f89e15222959a935da9af2c25 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -45,8 +45,8 @@ Program CreateAddProgram() { NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N}); - auto c = builder.add(a, b); - auto d = builder.add(a, c); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); auto program = builder.Build(); return program; @@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) { auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight auto b = builder.CreateInput(Float(32), {N}, "B"); // bias - auto mul_out = builder.mul(a, w, 2, 1); - auto add_out = builder.add(mul_out, b); + auto mul_out = builder.Mul(a, w, 2, 1); + auto add_out = builder.Add(mul_out, b); auto program = builder.Build(); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 7416d206fc43eaf5a56c3eb606bb0672d1172c0b..d7478b18dba0616fdc995866d8892c7c052a0e35 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type, } void BasicEngine::Execute() { + platform::RecordEvent backward_record_event( + "backward", platform::TracerEventType::Operator, 1); + if (init_nodes_.empty()) { 
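// [Editor's illustrative sketch -- not part of this patch.] The
// BuildInferMetaContext / BuildPhiKernelContext changes above share one idea:
// an int attribute may now come either from the op's attribute map or, as a
// fallback, from an input variable holding the value at runtime. A minimal
// standalone model of that resolution order (std-only; names are illustrative):

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

// Stand-in for "read the scalar out of an input variable at runtime".
using TensorInputs = std::map<std::string, int>;

static int ResolveIntAttr(const std::string& name,
                          const std::map<std::string, int>& attrs,
                          const TensorInputs& inputs) {
  auto attr_it = attrs.find(name);
  if (attr_it != attrs.end()) return attr_it->second;  // 1) plain attribute
  auto in_it = inputs.find(name);
  if (in_it != inputs.end()) return in_it->second;     // 2) value carried by an input tensor
  throw std::runtime_error("cannot find attribute `" + name +
                           "` in either the attribute map or the inputs");
}

int main() {
  std::map<std::string, int> attrs = {{"axis", 1}};
  TensorInputs inputs = {{"num_classes", 10}};
  std::printf("axis=%d num_classes=%d\n",
              ResolveIntAttr("axis", attrs, inputs),
              ResolveIntAttr("num_classes", attrs, inputs));  // axis=1 num_classes=10
  return 0;
}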
return; } @@ -412,7 +415,7 @@ void BasicEngine::Execute() { for (auto& cur_op : *shared_cur_node) { platform::RecordEvent op_type_record_event( - cur_op.Type(), platform::TracerEventType::Operator, 1); + cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d7c0c8cc547e6b04f67ddbb06121d139756d5142..8deb3b93e9c50489dcfc6805063f23e3705cb634 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); - if ((it == ins.end()) && - (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { - kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); - auto end_idx = start_idx + 1; - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - continue; + if (it == ins.end()) { + if (LIKELY(input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; + } else { + PADDLE_THROW(phi::errors::NotFound( + "Can not find input variable '%s' for %s OP, please check whether " + "the name setting in OpArgumentMapping is consistent with that in " + "OpMaker.", + input_names[i], pt_kernel_signature.name)); + } } + auto ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); @@ -410,6 +419,17 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (ins.find(attr_names[i]) != ins.end()) { + // deal tensor attr here + auto& ins_vector = ins.at(attr_names[i]); + auto tensor_attr = + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); + if (attr_defs[i].type_index == std::type_index(typeid(int))) { + int val = tensor_attr.template to(); + kernel_ctx->EmplaceBackAttr(val); + } else { + PADDLE_THROW(platform::errors::Unimplemented("only support int here")); + } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -466,6 +486,7 @@ void BuildDygraphPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 2e38bd77cf63cc85b75a50e62250a6e746f525bc..f754c6fdd0ee7742f0e544baad0225502c172848 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -34,6 +34,7 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); #endif namespace imperative = paddle::imperative; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 01c9d2847e0c850fd4159613a47d647bdbf46c31..c55599cc9aa954e2bd437f0917c792e4fdb6b577 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -177,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& 
ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 1); + type + " trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -297,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); } - if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, - paddle::platform::errors::PermissionDenied( - "We expect passed_default_attrs_ is nullptr while " - "use_default_attr_map is true, however we got not null " - "passed_default_attrs_. Please check your usage of trace_op. ")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } else { - VLOG(3) << "No Grad to track for Op: " << type; + { + platform::RecordEvent node_creation_record_event( + type + " node_creation", platform::TracerEventType::Operator, 1); + + if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. ")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); + } else { + VLOG(3) << "No Grad to track for Op: " << type; + } + VLOG(6) << "Finish Trace Op: " << type; } - VLOG(6) << "Finish Trace Op: " << type; } template void Tracer::TraceOp( diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f5f36d805b43ea0815683e3b65bf157fe5beb2de..22d9dedb32ebfcc229e0034cc5cf6092907dc8df 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -262,6 +262,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. 
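// [Editor's illustrative sketch -- not part of this patch.] With the pass listed
// in CpuPassStrategy::EnableMKLDNN() above, it runs automatically once a user
// turns oneDNN on in the inference config. A hedged usage sketch, assuming the
// paddle_infer C++ API (Config::SetModel / EnableMKLDNN, CreatePredictor) and
// purely illustrative model paths:

#include <memory>
#include "paddle_inference_api.h"  // header name as shipped with the inference library

int main() {
  paddle_infer::Config config;
  config.SetModel("./model/inference.pdmodel",      // hypothetical paths
                  "./model/inference.pdiparams");
  config.EnableMKLDNN();  // CPU pass list now includes elt_act_mkldnn_fuse_pass
  auto predictor = paddle_infer::CreatePredictor(config);
  // ... feed inputs and call predictor->Run() as usual; fused elementwise +
  // activation ops are produced transparently during graph optimization.
  return 0;
}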
// https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index fe04d552e40263a396059e3da59de4d51def67e0..7b65d2d7c97cca335f76f1d0399a25bcd8a00c92 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc index b8e87a8d94d1f43d35da1a46c300a1b37c9382ec..5a306f622adbe7a298ab53daae1168ad50b402a9 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool3d); +USE_OP_ITSELF(pool3d); REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7f7313fbcb5969aafea47ad23248acd5a6ca3644..1946f9e28388e3ab6d1d580d0f7d91c1ef3e604f 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -54,5 +54,5 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } USE_OP_ITSELF(relu); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 1725888abc379bfa4ffbbc5cfc4cecd1872c7c18..f17e00de0eeb7c8f4d782f0a4eaecc2fc1df268b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) { } // namespace paddle // USE_OP(leaky_relu); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index bded833505cd25352adc4123de415613d1fc926d..36f13262a73d703a6d9776855adbab3c44075aa7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 861a9aa9d000bff9e6dcc673cc5c8d99c3a7a6ec..5596a89a083fe9ff177aa9abc769b8fa27105c1f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool3d_type_ == Pool3DType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); } else if (pool3d_type_ == Pool3DType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); @@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool3d_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); } else if (pool3d_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 6d711c26adc6ff8e49375d15f32322303f3ae6ef..9bfe98d759d8e29bc34b42fa667e5cda5f1493de 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool_type_ == PoolType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, exclusive_, adaptive_, odatas[0], stream, @@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, exclusive_, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index a7a417c29a7bdb7a47d4798246de55c0bd3536f9..f296ce96d4e5f6dca5c4ad2668eea8508b37068f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) + if (WITH_GPU) + cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2f24d5aed1eb827b4857f5936a19b206a38c788 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _WIN32 + +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +namespace { +std::mutex ipc_mutex_; +std::unordered_map> ipc_handle_to_baseptr_; +} // namespace + +std::shared_ptr GetIpcBasePtr(std::string handle) { + std::lock_guard lock(ipc_mutex_); + + auto iter = ipc_handle_to_baseptr_.find(handle); + if (iter != ipc_handle_to_baseptr_.end()) { + auto baseptr = iter->second.lock(); + if (baseptr) return baseptr; + } + // The IpcMemHandle can only open once for the same handle, + // so here we cache it here. + void *baseptr = nullptr; + auto ipc_handle = + reinterpret_cast(handle.c_str()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle( + &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + // Close ipc handle on the same device. + int device_id = platform::GetCurrentDeviceId(); + // Add deleter to close ipc handle. + auto sp = std::shared_ptr(baseptr, [handle, device_id](void *ptr) { + platform::CUDADeviceGuard guard(device_id); + std::lock_guard lock(ipc_mutex_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr)); + ipc_handle_to_baseptr_.erase(handle); + VLOG(6) << "cudaIpcCloseMemHandle for ptr:" + << "\t" << ptr; + }); + std::weak_ptr wp = sp; + ipc_handle_to_baseptr_.insert(iter, {handle, wp}); + + return sp; +} + +CudaIpcAllocation::~CudaIpcAllocation() { + shared_ptr_.reset(); + VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:" + << "\t" << this->ptr(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.h b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..52e3cf10ea73a787d87d19beeedcdedca1e3dd3b --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
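// [Editor's illustrative sketch -- not part of this patch.] GetIpcBasePtr above
// caches cudaIpcOpenMemHandle results and closes the handle from a shared_ptr
// deleter. The underlying CUDA runtime handshake, reduced to its two halves;
// producer and consumer run in *different* processes, and how the handle bytes
// travel between them (pipe, shared memory, socket) is up to the caller, so no
// main() is given here:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <memory>

#define CUDA_CHECK(expr)                                             \
  do {                                                               \
    cudaError_t err_ = (expr);                                       \
    if (err_ != cudaSuccess) {                                       \
      std::fprintf(stderr, "%s failed: %s\n", #expr,                 \
                   cudaGetErrorString(err_));                        \
      std::exit(1);                                                  \
    }                                                                \
  } while (0)

// Producer process: allocate device memory and export an IPC handle for it.
cudaIpcMemHandle_t ExportDeviceBuffer(void** dev_ptr, size_t bytes) {
  CUDA_CHECK(cudaMalloc(dev_ptr, bytes));
  cudaIpcMemHandle_t handle;
  CUDA_CHECK(cudaIpcGetMemHandle(&handle, *dev_ptr));
  return handle;  // send these bytes to the consumer process
}

// Consumer process: open the handle once and close it when the last user drops
// it, mirroring the shared_ptr-with-deleter pattern used by GetIpcBasePtr.
std::shared_ptr<void> OpenSharedDeviceBuffer(const cudaIpcMemHandle_t& handle,
                                             int device_id) {
  CUDA_CHECK(cudaSetDevice(device_id));
  void* base = nullptr;
  CUDA_CHECK(cudaIpcOpenMemHandle(&base, handle, cudaIpcMemLazyEnablePeerAccess));
  return std::shared_ptr<void>(base, [device_id](void* ptr) {
    cudaSetDevice(device_id);     // close on the device that opened the handle
    cudaIpcCloseMemHandle(ptr);
  });
}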
+ +#ifndef _WIN32 +#pragma once + +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +std::shared_ptr GetIpcBasePtr(std::string handle); + +class CudaIpcAllocation : public Allocation { + public: + explicit CudaIpcAllocation(void *ptr, size_t size, int device_id, + std::shared_ptr shared_ptr) + : Allocation(ptr, size, platform::CUDAPlace(device_id)), + device_id_(std::move(device_id)), + shared_ptr_(std::move(shared_ptr)) {} + + inline const int &device_id() const { return device_id_; } + + ~CudaIpcAllocation() override; + + private: + int device_id_; + std::shared_ptr shared_ptr_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index acaf5d548555cc3ee69bc5a03309645006256487..25c2235cce85369babc4d601de96c7475a0b1fbd 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -29,6 +29,155 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName() { + static std::random_device rd; + std::string handle = "/paddle_"; +#ifdef _WIN32 + handle += std::to_string(GetCurrentProcessId()); +#else + handle += std::to_string(getpid()); +#endif + handle += "_"; + handle += std::to_string(rd()); + return handle; +} + +struct CountInfo { + std::atomic refcount; +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **map_ptr_, int *fd_) { + // TODO(@ZHUI): support win32 + int file_flags = 0; + int fd = -1; + if (flags & MAPPED_SHAREDMEM) { + file_flags = O_RDWR | O_CREAT; + } else { + file_flags = O_RDONLY; + } + if (flags & MAPPED_EXCLUSIVE) { + file_flags |= O_EXCL; + } + if (flags & MAPPED_NOCREATE) { + file_flags &= ~O_CREAT; + } + + if (!(flags & MAPPED_FROMFD)) { + if (flags & MAPPED_SHAREDMEM) { + fd = shm_open(filename.c_str(), file_flags, (mode_t)0600); + PADDLE_ENFORCE_NE( + fd, -1, + platform::errors::Unavailable( + "File descriptor %s open failed, unable in read-write mode", + filename.c_str())); + VLOG(6) << "shm_open: " << filename; + } + } else { + fd = -1; + } + + PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0, + platform::errors::Unavailable( + "Fruncate a file to a specified length failed!")); + + if (flags & MAPPED_SHAREDMEM) { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + } else { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + } + + PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED, + platform::errors::Unavailable( + "Memory map failed when create shared memory.")); + + if (flags & MAPPED_KEEPFD) { + *fd_ = fd; + } else { + PADDLE_ENFORCE_NE(::close(fd), -1, + platform::errors::Unavailable( + "Error closing memory maped file <", filename, ">")); + + *fd_ = -1; + } +} + +std::shared_ptr +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t size) { + int fd = -1; + void *base_ptr = nullptr; + AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + void *aliged_base_ptr = + static_cast(static_cast(base_ptr) + mmap_alignment); + return std::make_shared(aliged_base_ptr, size, + filename, flags, fd); +} + 
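// [Editor's illustrative sketch -- not part of this patch.] The refcounted
// mapping above stores a small header (an atomic refcount) at the base of the
// shared-memory segment and hands callers a pointer offset by mmap_alignment.
// A stripped-down standalone version of that layout (POSIX only, link with -lrt
// on older glibc; the segment name is illustrative):

#include <atomic>
#include <cstdio>
#include <cstring>
#include <new>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static constexpr int64_t kAlignment = 64;  // same role as mmap_alignment

struct CountInfo {
  std::atomic<int> refcount;
};

int main() {
  const char* name = "/demo_refcounted_shm";   // illustrative segment name
  const size_t payload = 1024;
  const size_t total = payload + kAlignment;   // refcount header + user data

  int fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL, 0600);
  if (fd == -1) { std::perror("shm_open"); return 1; }
  if (ftruncate(fd, total) != 0) { std::perror("ftruncate"); return 1; }

  void* base = mmap(nullptr, total, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd);
  if (base == MAP_FAILED) { std::perror("mmap"); return 1; }

  // The creator (MAPPED_EXCLUSIVE in the patch) constructs the refcount in
  // place; later attachers would simply increment it.
  auto* info = static_cast<CountInfo*>(base);
  new (&info->refcount) std::atomic<int>(1);
  void* data = static_cast<char*>(base) + kAlignment;  // what callers actually see
  std::memset(data, 0, payload);

  // Detach: the last owner unlinks the segment before unmapping.
  if (--info->refcount == 0) shm_unlink(name);
  munmap(base, total);
  std::printf("created, used and released %zu bytes at %p\n", payload, data);
  return 0;
}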
+RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( + void *ptr, size_t size, std::string ipc_name, int fd, int flags) + : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { + // must reset base ptr first. + resetBaseptr(); + initializeRefercount(); +} + +void MemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; +} + +MemoryMapAllocation::~MemoryMapAllocation() { close(); } + +void RefcountedMemoryMapAllocation::incref() { + CountInfo *info = static_cast(map_ptr_); + ++info->refcount; +} + +int RefcountedMemoryMapAllocation::decref() { + CountInfo *info = static_cast(map_ptr_); + return --info->refcount == 0; +} + +void RefcountedMemoryMapAllocation::resetBaseptr() { + map_ptr_ = + static_cast(static_cast(map_ptr_) - mmap_alignment); + map_size_ = map_size_ + mmap_alignment; +} + +void RefcountedMemoryMapAllocation::initializeRefercount() { + CountInfo *info = reinterpret_cast(map_ptr_); + + if (flags_ & MAPPED_EXCLUSIVE) { + new (&info->refcount) std::atomic(1); + } else { + info->refcount++; + } +} + +void RefcountedMemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; + void *data = map_ptr_; + CountInfo *info = reinterpret_cast(data); + if (--info->refcount == 0) { + PADDLE_ENFORCE_NE( + shm_unlink(ipc_name_.c_str()), -1, + platform::errors::Unavailable( + "could not unlink the shared memory file ", ipc_name_)); + VLOG(6) << "shm_unlink file: " << ipc_name_; + } + + PADDLE_ENFORCE_NE( + munmap(map_ptr_, map_size_), -1, + platform::errors::Unavailable("could not unmap the shared memory file: ", + strerror(errno), " (", errno, ")")); +} + MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { PADDLE_ENFORCE_NE( munmap(this->ptr(), this->size()), -1, @@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() { /* Here we do not pay attention to the result of shm_unlink, because the memory mapped file may have been cleared due to the MemoryMapFdSet::Clear() */ + + // Code of DataLoader subprocess: + // + // core._array_to_share_memory_tensor(b) + // out_queue.put((idx, tensor_list, structure)) + // core._remove_tensor_list_mmap_fds(tensor_list) + + /* If the tensor in already in the send queue, the tensor will be + * deconstructed by the function. If the tensor not send yet, it + * will be cleared by MemoryMapFdSet::Clear(). + * If the `_remove_tensor_list_mmap_fds` have be interrupted, the + * tensor will be cleared by both methods. 
+ * */ + shm_unlink(this->ipc_name().c_str()); MemoryMapFdSet::Instance().Remove(this->ipc_name()); VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name(); } -std::string GetIPCName() { - static std::random_device rd; - std::string handle = "/paddle_"; -#ifdef _WIN32 - handle += std::to_string(GetCurrentProcessId()); -#else - handle += std::to_string(getpid()); -#endif - handle += "_"; - handle += std::to_string(rd()); - return handle; -} - std::shared_ptr AllocateMemoryMapWriterAllocation( size_t size) { const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; - - int fd = shm_open(ipc_name.c_str(), flags, 0644); + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); @@ -86,12 +235,14 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( std::shared_ptr RebuildMemoryMapReaderAllocation( const std::string &ipc_name, size_t size) { - int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644); + int flags = O_RDWR | O_CREAT; + flags &= ~O_CREAT; + + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); - - void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( "Memory map failed when rebuild shared memory.")); diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h index 3f91e5c42780826ae0ef2e61e982da2336d10a3f..4f8dbfbb51e66db227dfcf46bc3ce313d8406dd1 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -16,8 +16,9 @@ #ifndef _WIN32 +#include #include -#include // NOLINT +#include #include #include #include @@ -28,6 +29,72 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName(); + +static constexpr int64_t mmap_alignment = 64; + +enum MappedModes { + MAPPED_SHAREDMEM = 1, + MAPPED_EXCLUSIVE = 2, + MAPPED_NOCREATE = 4, + MAPPED_KEEPFD = 8, + MAPPED_FROMFD = 16, + MAPPED_UNLINK = 32 +}; + +class MemoryMapAllocation : public Allocation { + public: + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + map_ptr_(ptr), + map_size_(size) {} + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + fd_(fd), + flags_(flags), + map_ptr_(ptr), + map_size_(size) {} + + inline const std::string &ipc_name() const { return ipc_name_; } + + virtual void close(); + + ~MemoryMapAllocation() override; + + protected: + std::string ipc_name_; + int fd_ = -1; + int flags_ = 0; + void *map_ptr_ = nullptr; + size_t map_size_ = 0; + bool closed_ = false; +}; + +class RefcountedMemoryMapAllocation : public MemoryMapAllocation { + public: + RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd); + + void incref(); + int decref(); + void close() override; + virtual ~RefcountedMemoryMapAllocation() { close(); } + + protected: + void initializeRefercount(); + void resetBaseptr(); +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **base_ptr_, int *fd_); + +std::shared_ptr 
+AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t size); + class MemoryMapWriterAllocation : public Allocation { public: explicit MemoryMapWriterAllocation(void *ptr, size_t size, diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 66f1bcc8b68692abe588b6429b027462eaebde24..4205f2253a652ccc5f6d4886df1b1194f5e5062f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1482,6 +1482,9 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); +REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, ThresholdedReluGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1567,23 +1570,6 @@ REGISTER_OPERATOR( ops::ActivationOpTripleGrad::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); -REGISTER_OP_CPU_KERNEL( - tanh_grad_grad, ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1623,16 +1609,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); -REGISTER_OP_CPU_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); /* ========================================================================== */ /* ======================== elu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4b79397b6cdf2e5c2993f7a72f512cc924c208e7..b076db01c22c62b17fdd85b7208467eea1375fed 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -253,6 +253,14 @@ struct SigmoidFunctor : public BaseActivationFunctor { template \ using name##GradFunctor = phi::funcs::name##GradFunctor; +#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \ + template \ + using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; + +#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \ + template \ + using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; + USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Acos) @@ -264,6 +272,13 @@ USE_PHI_FUNCTOR(Cosh) USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Atanh) +USE_PHI_FUNCTOR(Tanh) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(BRelu) +USE_PHI_FUNCTOR(ThresholdedRelu) +USE_PHI_FUNCTOR(LeakyRelu) +USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) template struct SigmoidGradFunctor : public BaseActivationFunctor { @@ -497,117 +512,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using 
ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.tanh(); - } -}; - -template -struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (static_cast(1) - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct TanhGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); - // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out - // * ddx) - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); - dout_new.device(*d) = - static_cast(-1) * dout * static_cast(2) * out * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); - ddout.device(*d) = (static_cast(1) - out * out) * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; -/* - Out - DOut D_Dout - DDx -> TanhTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (-2) * Out * DDx * D_Dout_new - D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new - D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct TanhTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); - d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - - (static_cast(2) * dout * ddx * d_dOutNew); - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); - d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; 
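// [Editor's note -- not part of this patch.] The Eigen functors deleted here are
// now supplied by phi::funcs (Tanh{Grad,GradGrad,TripleGrad}Functor, wired in by
// the USE_PHI_* macros above). All of their formulas derive from the single
// identity d/dx tanh(x) = 1 - tanh(x)^2; a quick standalone finite-difference
// sanity check of that identity:

#include <cmath>
#include <cstdio>

int main() {
  const double x = 0.7, h = 1e-6;
  const double analytic = 1.0 - std::tanh(x) * std::tanh(x);
  const double numeric = (std::tanh(x + h) - std::tanh(x - h)) / (2.0 * h);
  std::printf("analytic=%.9f numeric=%.9f diff=%.2e\n",
              analytic, numeric, std::fabs(analytic - numeric));
  return 0;
}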
- } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); - d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - - static_cast(2) * out * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -909,42 +813,6 @@ struct SquareGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct BReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); - } -}; - -template -struct BReluGradFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -1168,41 +1036,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct LeakyReluFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - if (alpha < 1.f) { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); - } else { - out.device(d) = x.cwiseMin(static_cast(alpha) * x); - } - } -}; - -template -struct LeakyReluGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = - static_cast(alpha) * (x < static_cast(0)).template cast(); - auto temp2 = (x >= static_cast(0)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct ELUFunctor : public BaseActivationFunctor { float alpha; @@ -1430,37 +1263,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ThresholdedReluFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto th = static_cast(threshold); - out.device(d) = (x > th).template cast() * x; - } -}; - -template -struct ThresholdedReluGradFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto th = static_cast(threshold); - dx.device(d) = dout 
* (x > th).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct HardSigmoidFunctor : public BaseActivationFunctor { float slope; @@ -1531,121 +1333,6 @@ struct SwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -/* - * in arguments: x, out, ddx - * out arguments: ddout, dout, dx - */ -template -inline void ExtractActivationDoubleGradTensor( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** Out, const framework::Tensor** ddX, - framework::Tensor** dX, framework::Tensor** dOut, - framework::Tensor** ddOut) { - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); - if (ddo_var) { - *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - ddo_var); - } - } else { - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - } - PADDLE_ENFORCE_NOT_NULL( - *ddX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Output, variable name = %s", - ctx.OutputName("DDX"))); - - if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NOT_NULL( - x_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); - auto dx_var = ctx.OutputVar("DX"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); - if (dx_var) { - *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dx_var); - } - } else { - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *X = *ddX; - } - if (static_cast(kDepValue) & - static_cast(ActBwdOpFwdDeps::kDepOut)) { - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *Out = - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); - if (dout_var) { - *dOut = - paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dout_var); - } - } else { - *Out = ctx.Input("Out"); - if (dout_var) { - *dOut = ctx.Output("DOut"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *Out = *ddX; - } -} - -template -class ActivationDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *Out, *ddX; - X = Out = ddX = nullptr; - framework::Tensor *ddOut, *dOut, *dX; - ddOut = dOut = dX = nullptr; - - ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, - &dX, &dOut, &ddOut); - - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - if (dOut) dOut->mutable_data(ctx.GetPlace()); - if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = 
functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, Out, ddX, ddOut, dOut, dX); - } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -1667,35 +1354,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct LeakyReluGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - if (ddOut) { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); - ddout.device(*d) = - ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct ELUGradGradFunctor : public BaseActivationFunctor { float alpha; @@ -2504,7 +2162,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ @@ -2515,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 92a101451e211f912e5390171654affa3be4e973..256f20db08445e8b8d5933aa0e3151f69fcb5b10 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,38 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaLeakyReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : static_cast(alpha) * x; - } -}; - -template -struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > zero ? 
dout : static_cast(alpha) * dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaSigmoidFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -224,31 +192,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tanh(x)); - } -}; - -template -struct CudaTanhGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * (one - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -476,45 +419,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaBReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - T temp_max = x > t_min_cast ? x : t_min_cast; - T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; - return temp_min; - } -}; - -template -struct CudaBReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // dx = (x > t_min && x < t_max) ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - return (x > t_min_cast && x < t_max_cast) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaSoftReluFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -907,38 +811,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaThresholdedReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T x) const { - return x > static_cast(threshold) ? x : zero; - } -}; - -template -struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > static_cast(threshold) ? 
dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSwishFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -1212,6 +1084,22 @@ class ActivationGradCudaKernel } }; +USE_PHI_FUNCTOR(CudaCos) +USE_PHI_FUNCTOR(CudaTan) +USE_PHI_FUNCTOR(CudaAcos) +USE_PHI_FUNCTOR(CudaSin) +USE_PHI_FUNCTOR(CudaAsin) +USE_PHI_FUNCTOR(CudaAtan) +USE_PHI_FUNCTOR(CudaSinh) +USE_PHI_FUNCTOR(CudaCosh) +USE_PHI_FUNCTOR(CudaAsinh) +USE_PHI_FUNCTOR(CudaAcosh) +USE_PHI_FUNCTOR(CudaAtanh) +USE_PHI_FUNCTOR(CudaTanh) +USE_PHI_FUNCTOR(CudaBRelu) +USE_PHI_FUNCTOR(CudaLeakyRelu) +USE_PHI_FUNCTOR(CudaThresholdedRelu) + } // namespace operators } // namespace paddle @@ -1270,20 +1158,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ - /* ======================== elu register ============================ */ REGISTER_OP_CUDA_KERNEL( elu, ops::ActivationCudaKernel>); /* ========================================================================== */ -/* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, - CudaTanhGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - tanh_grad_grad, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1521,7 +1372,6 @@ REGISTER_OP_CUDA_KERNEL( __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ - __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ @@ -1535,8 +1385,6 @@ REGISTER_OP_CUDA_KERNEL( CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ - CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 8fb9929c39e9223303f4427f1a0d7e1ed66134d4..88d7cb7c1f5f4bf47dc82f8632116424253d6d19 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -12,52 +12,20 @@ // See the License for the specific language governing 
permissions and // limitations under the License. -#include "paddle/fluid/operators/allclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct AllcloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - *out_data = true; - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - *out_data &= val; - } - } -}; - class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", phi::make_ddim({1})); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, + PD_INFER_META(phi::AllValueCompareInferMeta)); REGISTER_OPERATOR( allclose, ops::AllcloseOp, ops::AllcloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::AllcloseOpVarTypeInference); 
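The element-wise check performed by the deleted CPU AllcloseFunctor is now supplied by the phi allclose kernel; for reference, a minimal standalone sketch of that predicate follows. The function name and signature are illustrative only and are not part of this patch.

#include <cmath>
#include <cstddef>

// allclose per element: |a - b| <= atol + rtol * |b|, with optional
// NaN-equals-NaN handling. Simplified from the removed functor, which also
// allowed a small extra slack of 1e-15 on the comparison.
static bool AllcloseReference(const double* a, const double* b, std::size_t n,
                              double rtol, double atol, bool equal_nan) {
  bool all_close = true;
  for (std::size_t i = 0; i < n; ++i) {
    bool val;
    if (std::isnan(a[i]) || std::isnan(b[i])) {
      val = equal_nan && (std::isnan(a[i]) == std::isnan(b[i]));
    } else {
      double left = std::fabs(a[i] - b[i]);
      double right = atol + rtol * std::fabs(b[i]);
      val = (a[i] == b[i]) || (left <= right);
    }
    all_close = all_close && val;
  }
  return all_close;
}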
-REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); + ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(allclose) diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu deleted file mode 100644 index 32c90ff8fdc109b30b140f0f70b336615ce93c17..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/allclose_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/allclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - if (!val) *out_data = false; - } -} - -template -struct AllcloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? 
block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, sizeof(bool)); -#else - cudaMemset(out_data, true, sizeof(bool)); -#endif - AllcloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h deleted file mode 100644 index 7a36754194ace5fad14d5a77e9d0be7f1c182087..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/allclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct AllcloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class AllcloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - 
framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - AllcloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..237cfcc6f1172518097863158ca6dbd595af4186 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(dev_ctx.GetPlace()); + + MLUCnnlTensorDesc scale_desc(*scale); + MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // check is_finite or is_nan + Tensor is_finite(found_inf->type()); + if (i != 0) { + is_finite.Resize(phi::make_ddim({1})); + is_finite.mutable_data(ctx.GetPlace()); + } else { + is_finite.ShareDataWith(*found_inf); + } + + MLUCnnlTensorDesc x_desc(*x); + + MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), + GetBasePtr(&is_finite)); + + // save is_finite by logical_and op after checking every input + if (i != 0) { + MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(), + GetBasePtr(found_inf), is_finite_desc.get(), + GetBasePtr(&is_finite), found_inf_desc.get(), + GetBasePtr(found_inf)); + } + + // The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // But when found_inf is true, the data of Out should not be used. + // So, on MLU, we always compute out with in/scale. 
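The MLU kernel above always divides by Scale and separately ORs the per-input NaN/Inf check into FoundInfinite. A host-side sketch of that contract is given below for reference only; the helper name and the use of std::vector are assumptions, not part of the MLU code.

#include <cmath>
#include <cstddef>
#include <vector>

// found_inf := true if any input element is NaN/Inf; out[i][j] := in[i][j] / scale.
// The division is unconditional, mirroring the kernel: when found_inf is true
// the framework discards Out, so its value does not matter.
static void CheckFiniteAndUnscaleReference(
    const std::vector<std::vector<float>>& xs, float scale,
    std::vector<std::vector<float>>* outs, bool* found_inf) {
  *found_inf = false;
  outs->resize(xs.size());
  for (std::size_t i = 0; i < xs.size(); ++i) {
    (*outs)[i].resize(xs[i].size());
    for (std::size_t j = 0; j < xs[i].size(); ++j) {
      if (!std::isfinite(xs[i][j])) *found_inf = true;
      (*outs)[i][j] = xs[i][j] / scale;
    }
  }
}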
+ MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleMLUKernel, + ops::CheckFiniteAndUnscaleMLUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc index 9dc287ab76a67c6026ec8794793e77179063af3d..c39743ef9914c039f13428d43a66b1aa66ada0ed 100644 --- a/paddle/fluid/operators/controlflow/compare_op_mlu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 93fbff67e220bcf7d1f8dab112a07cc42649595f..ac8c12bcd7ebaa6f47e8d3582887ac327a9f8957 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" @@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; +class DiagV2GradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "DiagV2Grad"); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DiagV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("diag_v2_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; + DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, PD_INFER_META(phi::DiagInferMeta)); -REGISTER_OPERATOR( - diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - DiagInferShapeFunctor); +REGISTER_OPERATOR(diag_v2, 
ops::DiagV2Op, ops::DiagV2OpMaker, + ops::DiagV2GradOpMaker, + ops::DiagV2GradOpMaker, + DiagInferShapeFunctor); + +REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp, + ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 14baeaa74d2421135401e94fbc10367d50b876fe..54931d99292f9d1453e2a3deb72e75ed63c9f46f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -1,11 +1,8 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -90,86 +87,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -// Fmax -template -struct FMaxFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmax(a, b); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmax(double_a, double_b); - return std::llrint(result); - } -}; - -// Fmin -template -struct FMinFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmin(a, b); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmin(double_a, double_b); - return std::llrint(result); - } -}; - template struct MinGradXFunctor { inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 91da732ef0d3dfbda5d9b7734071ec5831bcfa3f..d91315cc511aa80c0e9c44ccc688b2746eac764e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc 
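The FMaxFunctor / FMinFunctor family deleted from elementwise_functor.h above encoded fmax/fmin semantics: a lone NaN operand is ignored, and integer inputs are promoted, compared as floating point, then rounded back with std::lrint / std::llrint. A minimal sketch of those semantics, illustrative only and not part of this patch:

#include <cmath>
#include <cstdint>

// fmax ignores a single NaN: fmax(3.0, NaN) == 3.0; fmax(NaN, NaN) is NaN.
inline double FMaxRef(double a, double b) { return std::fmax(a, b); }

// int / int64_t variants promote, compare in floating point, and round back,
// as the removed specializations did.
inline int FMaxRefInt(int a, int b) {
  return static_cast<int>(
      std::lrint(std::fmax(static_cast<float>(a), static_cast<float>(b))));
}
inline std::int64_t FMaxRefInt64(std::int64_t a, std::int64_t b) {
  return std::llrint(std::fmax(static_cast<double>(a), static_cast<double>(b)));
}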
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp, ops::ElementwiseFMaxGradOpMaker); REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 123332a4a23de5c9534c8523993b87d8738f9869..0d5f56fda17322d86ef13990e9fc2432816dc9cb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index cff30be50a3d14c646cb7d13d6d8aeeb3de250f4..afe1073d89a06618af95490ac6d264073bd930d4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel { } }; -template -class ElementwiseFMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMaxFunctor(), z); - } -}; - template struct MaxGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { } }; -template -struct FMaxGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x >= y) || isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x >= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x >= y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x >= y)); - } -}; - -template -struct FMaxGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x >= y) || isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - 
paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x >= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template -class ElementwiseFMaxGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx(), - FMaxGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 3a1951999546eb859f6299b0bf7b064ff1b90a1a..dad80a2c33f3abfde457a6d750f89e47374fae13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp, ops::ElementwiseFMinGradOpMaker); REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index 5af985567d898d500b59e10d6032be57871c7e98..fb8bc9ac7f83c8dd99e40685acc68eec4c77b3ce 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMinGradKernel, ops::ElementwiseMinGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index 88fb044d42206eb0f89ac84df166e2e7ff33c5b3..283ad2adde978680d4d0c3a579d55e588368a28e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel { } }; -template -class ElementwiseFMinKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMinFunctor(), z); - } -}; - template struct MinGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); } }; - -template -struct FMinGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x <= y) || isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x <= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x <= y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x <= y)); - } -}; - -template -struct FMinGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x <= y) || isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x <= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template -class ElementwiseFMinGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx(), - FMinGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 763fc5f2674104a718e33f5ef5ac7b2a1a7b23f5..ad8fd317013908e8908dff8bea3440e24779454e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -32,6 +32,45 @@ using dnnl::stream; template class EltwiseMKLDNNKernel : public framework::OpKernel { + private: + dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { + dnnl::post_ops post_operations; + if (ctx.HasAttr("activation_type")) { + const float scale = ctx.HasAttr("activation_scale") + ? 
ctx.Attr("activation_scale") + : 1.0f; + const float alpha = ctx.HasAttr("activation_alpha") + ? ctx.Attr("activation_alpha") + : 0.0f; + const float beta = ctx.HasAttr("activation_beta") + ? ctx.Attr("activation_beta") + : 0.0f; + + static std::unordered_map algo_map = { + {"relu", dnnl::algorithm::eltwise_relu}, + {"tanh", dnnl::algorithm::eltwise_tanh}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"hardswish", dnnl::algorithm::eltwise_hardswish}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}}; + + const auto& activation_type = + algo_map.find(ctx.Attr("activation_type")); + + if (activation_type != algo_map.end()) { + post_operations.append_eltwise(scale, activation_type->second, alpha, + beta); + } + } + return post_operations; + } + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto& dev_ctx = @@ -47,9 +86,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o, get_post_ops(ctx)); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 508730c3c7335dbad8cf70417d2c19be4a8480a2..7870efba4e7a1a285bbd4b28b04c2b15f263c347 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -96,30 +96,6 @@ __global__ void filter_copy_fuse_kernel( if (N < ins_end) ins_end = N; - /* - if (!x1_lods_filled) { - for (int p = ins_start; p < ins_end; p++) { - x1_lods_data[p] = p; - } - if (idx == 0) { - x1_lods_data[N] = N; - } - } - - if (!x2_lods_filled) { - for (int p = ins_start; p < ins_end; p++) { - x2_lods_data[p] = p; - } - if (idx == 0) { - x2_lods_data[N] = N; - } - } - - if (!x1_lods_filled || !x2_lods_filled) { - b.sync(); - } - */ - int flag_data[5]; int prefix_sum_data[5]; int prefix_sum_data2[5]; @@ -173,8 +149,6 @@ __global__ void filter_copy_fuse_kernel( local_addr = prefix_sum_data[ins_end - 1 - ins_start]; sum_addr = local_addr; - // flag - // local_flag = 0; for (int p = ins_start; p < ins_end; p++) { local_flag += flag_data[p - ins_start]; } @@ -188,7 +162,6 @@ __global__ void filter_copy_fuse_kernel( sum_out_lods = local_out_lods; } - // 32 threads for (int i = 1; i < warp_thread_num; i *= 2) { int temp_addr = g.shfl_up(sum_addr, i); int temp_flag = g.shfl_up(sum_flag, i); @@ -266,27 +239,16 @@ __global__ void filter_copy_fuse_kernel( if (ins_start < ins_end) { int out_lods_idx = p_flag + 1; - - // ins_start = 1 - // BUG fix for (int p = ins_start; p < ins_end; p++) { if (flag_data[p - ins_start] == 1) { - // batch_len = 2 - // batch_len = 4 size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; - // t = 0 - // t = 1 int t = out_lods_idx - 1; - // out_lods_data[0] = 0; int previous; - if (out_lods_idx == p_flag + 1) { - // out_lods_data[t] = p_out_lods; previous = 
p_out_lods; } else { previous = out_lods_data[t]; } - map_data[t * 3] = (int64_t)previous; map_data[t * 3 + 1] = x1_lods_data[p]; map_lods_data[t] = t; @@ -300,7 +262,6 @@ __global__ void filter_copy_fuse_kernel( if (sum_out_lods4 > 1) { int out_data_num = sum_out_lods4 - 1; int out_start = ins_start; - if (out_start < out_data_num) { int out_end = ins_end >= out_data_num ? out_data_num : ins_end; for (int p = out_start; p < out_end; p++) { @@ -314,11 +275,8 @@ __global__ void filter_copy_fuse_kernel( if (flag_data[p - ins_start] == 1) { auto output_start_idx = prefix_sum_data2[p - ins_start]; T* dst = out_data + output_start_idx * x1_embed_size; - const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - - // optimized for (const T *j = src_start; j != src_end; dst++, j++) { *dst = *j; } @@ -338,12 +296,10 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, int idx = blockIdx.x * blockDim.x + threadIdx.x; int ins_start = idx * ins_per_thread; int ins_end = (idx + 1) * ins_per_thread; - if (ins_start >= N) { return; } if (ins_end > N) ins_end = N; - for (int p = ins_start; p < ins_end; p++) { T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; @@ -394,21 +350,17 @@ class FilterByInstagGPUKernel : public framework::OpKernel { const Tensor* x3 = context.Input("Filter_tag"); const int64_t* x3_data = x3->data(); - // int x2_lods_filled = 1; - Vector x2_lods; - // Vector, in GPU if (x2->lod().size() != 0) { // lod_level = 1 x2_lods = x2->lod()[0]; - // x2_lods_filled = 1; - } else { // lod_level = 0 const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_per_num = x2->dims()[1]; // x2_lods.resize(x2->dims()[0] + 1); // move to cuda x2_lods.push_back(0); for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(i + 1); + x2_lods.push_back(x2_lods.back() + instag_per_num); } } @@ -417,13 +369,8 @@ class FilterByInstagGPUKernel : public framework::OpKernel { size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); - // Vector, in GPU - // int x1_lods_filled = 1; Vector x1_lods; - if (!is_x1_lod) { - // move to cuda - // x1_lods.resize(x1->dims()[0] + 1); x1_lods.push_back(0); for (int i = 0; i < x1->dims()[0]; i++) { x1_lods.push_back(i + 1); @@ -432,7 +379,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel { // x1_lods = context.Input("Ins")->lod()[0]; // new: lod_level=0 => lod() return {} if (x1->lod().size() != 0) { // lod_level = 1 - // x1_lods_filled = 1; x1_lods = x1->lod()[0]; } else { // lod_level = 0 // x1_lods.resize(x1->dims()[0] + 1); @@ -458,10 +404,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel { LoDTensor* loss_weight = context.Output("LossWeight"); int out_first = x1_lods.back(); - // int out_first = x1->dims()[0]; - // if (x1_lods_filled) { - // out_first = x1_lods.back(); - // } out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 5ef13b38c8a86e16cefdc97be6934b313fdb7bc4..feae954e355b85f5a18f8a48919770fd46a73f70 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h" diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index e5ca15a39ef51f7807246c2ee1d473a0499b6463..7d7d6ae81a0935402f94cbc16e31fbba8009ce9c 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index a83abb245224baf837296aa6be8f6ceb96ac700c..21093f585b59eea24a231b4dcdf264dc16178fbd 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/kron_op.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2d284fb516e62b08fb48ab96d2478675c495c6f6..4331523d26edc1012ff67e4a08f69d682753bb7a 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "GroupNormGrad"); + OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", framework::GradVarName("Y"), "GroupNormGrad"); @@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("group_norm_grad"); + op->SetInput("X", this->Input("X")); op->SetInput("Scale", this->Input("Scale")); op->SetInput("Bias", this->Input("Bias")); op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); op->SetInput("Y", this->Output("Y")); + op->SetInput("Mean", this->Output("Mean")); op->SetInput("Variance", this->Output("Variance")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index b376334f1e93cc3be9e716d808525011edb29b94..ab8c50d90b8ece68b8e4e05d46cecd13fa84d7e0 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } -template -__device__ __forceinline__ void ThreadReduce(const T* input, int size, - const int offset, AccT* mean, - AccT* var) { +template +__device__ __forceinline__ void ThreadReduce(phi::Array arrs, + int size, const int offset, + AccT* out_mean, AccT* out_var) { + const T* x = arrs[0]; 
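// Reduction semantics, for reference:
//   Num == 1 (forward mean/var):  out_mean += x[i];  out_var += x[i] * x[i]
//   Num == 2 (backward ds/db):    out_mean += y[i];  out_var += y[i] * x[i]
//     where arrs[1] is dy, so the two sums are db = sum(dy) and ds = sum(dy * x).
// The vectorized loop below loads VecSize elements per thread; the offset
// branch handles the unaligned head and the final scalar loop handles the tail.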
+ const T* y; + if (Num == 2) { + y = arrs[1]; + } using VecT = kps::details::VectorType; int tid = threadIdx.x; if (offset > 0) { - input -= offset; + x -= offset; + if (Num == 2) { + y -= offset; + } size += offset; if (tid >= offset) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } size -= blockDim.x; - input += blockDim.x; + x += blockDim.x; + if (Num == 2) { + y += blockDim.x; + } } int remain = size % (VecSize * blockDim.x); - T ins[VecSize]; - VecT* ins_vec = reinterpret_cast(&ins); + T ins_x[VecSize]; + T ins_y[VecSize]; + VecT* ins_vec_x = reinterpret_cast(&ins_x); + VecT* ins_vec_y = reinterpret_cast(&ins_y); // vector part for (; VecSize * tid < (size - remain); tid += blockDim.x) { - *ins_vec = reinterpret_cast(input)[tid]; + *ins_vec_x = reinterpret_cast(x)[tid]; + if (Num == 2) { + *ins_vec_y = reinterpret_cast(y)[tid]; + } #pragma unroll for (int i = 0; i < VecSize; ++i) { - AccT temp = ins[i]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += ins_x[i]; + *out_var += ins_x[i] * ins_x[i]; + } else if (Num == 2) { + *out_mean += ins_y[i]; + *out_var += ins_y[i] * ins_x[i]; + } } } // scalar part tid = size - remain + threadIdx.x; for (; tid < size; tid += blockDim.x) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } } @@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, AccT x_var = static_cast(0); const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); x += i * size; - ThreadReduce(x, size, input_offset, &x_mean, &x_var); + phi::Array ins; + ins[0] = x; + ThreadReduce(ins, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( x_mean, kps::AddFunctor()); x_var = kps::details::BlockXReduce>( @@ -310,10 +341,12 @@ class GroupNormKernel }; template -__global__ void GroupNormBackwardGetMeanAndVar( - const T* x, const T* scale, const T* bias, const T* d_y, int N, int C, - int W, int imsize, int groups, int group_size, T epsilon, T* d_mean, - T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) { +__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, + const T* bias, const T* d_y, + int N, int C, int W, int imsize, + int groups, int group_size, + T epsilon, T* d_mean, T* d_var, + T* d_scale, T* d_bias) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -329,15 +362,11 @@ __global__ void GroupNormBackwardGetMeanAndVar( for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val, dval; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid] - x_bias; - dval = d_y[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; - dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - } + + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; d_var_data += val * dval; d_mean_data += dval * x_scale; @@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, const T* bias, const T* var, const T* d_mean, const T* d_var, int N, int 
C, int W, int imsize, int groups, int group_size, - T epsilon, T* d_x, - const DataLayout data_layout) { + T epsilon, T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, if (x_scale != 0) x_scale_inv = 1.0 / x_scale; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - if (data_layout == DataLayout::kNCHW) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * C + ccid) * imsize + imid]; - d_x[(bid * C + ccid) * imsize + imid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); - } else { - int hid = imid / W; - int wid = imid % W; - T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - d_x[(bid * H + hid) * W * C + wid * C + ccid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } +} + +template +__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + int i = blockIdx.x; + AccT ds_sum = static_cast(0); + AccT db_sum = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * imsize; + + phi::Array ins; + ins[0] = x; + ins[1] = dy; + ThreadReduce(ins, imsize, input_offset, &db_sum, + &ds_sum); + + ds_sum = kps::details::BlockXReduce>( + ds_sum, kps::AddFunctor()); + db_sum = kps::details::BlockXReduce>( + db_sum, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + ds[i] = ds_sum; + db[i] = db_sum; + } +} + +template +__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + const int nc = blockIdx.x; + T ds_sum = 0; + T db_sum = 0; + for (int i = threadIdx.x; i < imsize; i += blockDim.x) { + const int index = nc * imsize + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + CudaAtomicAddWithWarp(&ds[nc], ds_sum); + CudaAtomicAddWithWarp(&db[nc], db_sum); +} + +template +__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group, + T epsilon, const T* mean, + const T* var, const T* ds, + const T* db, T* d_scale, + T* d_bias) { + const int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int G = group; + const int D = C / G; + T sum1 = 0; + T sum2 = 0; + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + c / D; + sum1 += (d_scale == nullptr) + ? T(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rsqrt(var[ng] + epsilon))); + sum2 += (d_bias == nullptr) ? 
T(0) : db[nc]; + } + if (d_scale != nullptr) { + d_scale[c] = sum1; + } + if (d_bias != nullptr) { + d_bias[c] = sum2; } } } +template +__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups, + int group_size, T epsilon, + const T* mean, const T* var, + const T* scale, const T* ds, + const T* db, T* p1, T* p2, T* p3) { + const int n = blockIdx.x; + const int g = blockIdx.y; + const int ng = n * groups + g; + T sum1 = 0; + T sum2 = 0; + T var_inv = rsqrt(var[ng] + epsilon); + for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { + const int64_t index = ng * group_size + i; + const int64_t c = g * group_size + i; + const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); + sum1 += ds[index] * scale_v; + sum2 += db[index] * scale_v; + const T scale_c = scale == nullptr ? T(0) : static_cast(scale[c]); + p1[index] = scale_c * var_inv; + } + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); + sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); + + if (threadIdx.x == 0) { + const T s = T(1) / static_cast(group_size * imsize); + const T x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(var_inv) * static_cast(var_inv) * + static_cast(var_inv) * s; + p2[ng] = x; + p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; + } +} + +template +__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size, + int groups, T* p1, T* p2, T* p3, + const T* x, const T* dy, T* dx) { + int cid = blockIdx.x; + int gid = blockIdx.y; + int bid = blockIdx.z; + int ccid = bid * C + gid * group_size + cid; + int ng = bid * groups + gid; + int nc = gid * group_size + cid; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int index = (bid * C + nc) * imsize + imid; + dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; + } +} + template class GroupNormGradKernel : public framework::OpKernel { @@ -408,7 +552,9 @@ class GroupNormGradKernel const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -433,31 +579,27 @@ class GroupNormGradKernel phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_var, static_cast(0)); - T* temp_var_data = temp_var.data(); - - Tensor temp_mean; - temp_mean.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_mean, static_cast(0)); - T* temp_mean_data = temp_mean.data(); + Tensor ds, db; + ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); + db.mutable_data({x_dims[0], C}, ctx.GetPlace()); + T* ds_data = ds.data(); + T* db_data = db.data(); + auto* y_data = y->data(); auto* x_data = x->data(); T* d_x_data = nullptr; if (d_x) d_x_data = d_x->data(); - auto* y_data = d_y->data(); + auto* dy_data = d_y->data(); auto* var_data = var->data(); + auto* mean_data = mean->data(); T* d_scale_data = nullptr; if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_scale, static_cast(0)); d_scale_data = d_scale->data(); } T* d_bias_data = nullptr; if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - 
set_zero(dev_ctx, d_bias, static_cast(0)); d_bias_data = d_bias->data(); } @@ -479,22 +621,103 @@ class GroupNormGradKernel #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); + const int block_dims = 256; #else int block_size = std::min(1024, imsize); + const int block_dims = 1024; #endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, - bias_data, y_data, x_dims[0], C, W, imsize, groups, - group_size, epsilon, temp_mean_data, temp_var_data, - d_scale_data, d_bias_data, data_layout); - if (d_x_data != nullptr) { - UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, - bias_data, var_data, temp_mean_data, temp_var_data, - x_dims[0], C, W, imsize, groups, group_size, epsilon, - d_x_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(imsize / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); + if (imsize < vec_size) { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } else { + VectorizedGetDsDbCUDAKernel< + T, AccT, vec_size><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } + + if (d_scale || d_bias) { + const int block = 256; + GetScaleBiasGradientCUDAKernel< + T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); + } + + if (d_x_data != nullptr) { + // p1 * dy + p2 * x + p3, + // p1, p2, p3 represent the reverse calculation of temporary variables + // p1 = scale * var_inv + // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) + // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); + Tensor p1, p2, p3; + p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); + p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + T* p1_data = p1.data(); + T* p2_data = p2.data(); + T* p3_data = p3.data(); + + GetBackwardParamsCUDAKernel<<< + dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetXGradientCUDAKernel<<>>( + imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, + dy_data, d_x_data); + } + + } else { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + Tensor temp_mean; + temp_mean.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + int flags = (scale_data != nullptr) * kHasScale + + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data, + scale_data, bias_data, 
dy_data, x_dims[0], C, W, imsize, + groups, group_size, epsilon, temp_mean_data, + temp_var_data, d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], C, W, imsize, groups, group_size, epsilon, + d_x_data); + } } } }; diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index d61eb46d97e98972963f5871a4c6e7b06468337c..cd297c53f89a0f7efc622de7c385b9f75dc7462b 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( template __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* w_id, T* w1lambda, T* w2lambda, T src_w, - const int in_img_w) { - src_w = (src_w > 0) ? src_w : 0.f; - *in_img_idx = static_cast(src_w); - *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0; - *w1lambda = src_w - *in_img_idx; - *w2lambda = 1.f - *w1lambda; + int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; } struct FastDivModForInterpolate { @@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory( } } +__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, + const int width, const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, + const int out_h, const int out_w, + const int n, const int num_channels, + float ratio_h, float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, + src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, + src_x, in_w); + + T d2val = out[index]; + + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + template __global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, const T* __restrict__ out, const int out_h, const int out_w, const int n, - const int num_channels, float ratio_h, - float ratio_w, const T align_type_value, - bool is_nchw) { + const int out_chw, const int num_channels, + float ratio_h, float ratio_w, + const T align_type_value, + FastDivModForInterpolate divmods) { 
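+  // Index decomposition sketch (assuming FastDivModForInterpolate wraps fast
+  // dividers for out_chw, num_channels and out_w * num_channels, as its
+  // construction FastDivModForInterpolate(c, out_chw, c * out_w) at the
+  // launch site suggests): the Divmod calls below are roughly equivalent to
+  //   out_id_h    = tid / out_chw;
+  //   out_id_w    = tid % out_chw;
+  //   channel_id  = tid % num_channels;
+  //   out_img_idy = out_id_w / (out_w * num_channels);
+  //   out_img_idx = (out_id_w % (out_w * num_channels)) / num_channels;
+  // but avoid per-iteration integer division and modulo.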
int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; int nthreads = n * out_chw; - if (is_nchw) { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id], - h1lambda * w1lambda * value); - } - } else { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int out_img_idy = out_id_w / (out_w * num_channels); - int out_img_idx = out_id_w % (out_w * num_channels) / num_channels; - int channel_id = tid % num_channels; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, + &w2lambda, src_w, in_w); + 
PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, + &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); } } @@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ctx.cuda_device_context().stream()>>>( input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, ratio_h, ratio_w, align_type_value, is_nchw); + } else if (!optimize_flag & is_nchw) { + // + const int num_kernels = n * c * out_h * out_w; + const int num_threads = + std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, + output_grad_data, align_type_value); } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); } } else if ("bicubic" == interp_method) { #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 58d51ab1c723f296d3728a23de95a116acbb4df3..68d0c7978b4e45f216abd5fa5c4be93f788e8f04 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,9 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -178,27 +176,4 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, ops::KronGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - REGISTER_OPERATOR(kron_grad, ops::KronGradOp); -REGISTER_OP_CPU_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu deleted file mode 100644 index e5124e65007509568ae8cd8ab65b33c504a12fe9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kron_op.cu +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - -REGISTER_OP_CUDA_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h deleted file mode 100644 index 274b47c03a4d3d381dceda43d502a6e2d14669a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kron_op.h +++ /dev/null @@ -1,415 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -// Process an element in the output, used with a parallel-for -template -struct KronElemFunctor { - KronElemFunctor(const T* a, const T* b, T* out, const int64_t* shape_b, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* stride_out, int ndims) - : a_(a), - b_(b), - out_(out), - shape_b_(shape_b), - stride_a_(stride_a), - stride_b_(stride_b), - stride_out_(stride_out), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) const { - // it computes 1 element in the output - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_out_[i]; - index = index % stride_out_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - out_[idx] = a_[index_a] * b_[index_b]; - } - - private: - const T* a_; - const T* b_; - T* out_; - const int64_t* shape_b_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* stride_out_; - const int ndims_; -}; - -template -struct KronOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& x, - const framework::Tensor& y, framework::Tensor* out) { - int ndims = out->dims().size(); - int64_t numel = out->numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_out = out->dims(); - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_out = phi::stride(dim_out); - - const int64_t *p_stride_x = nullptr, *p_stride_y = 
nullptr, - *p_stride_out = nullptr, *p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_out(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_out.Get(), stride_out.Get() + ndims, - d_stride_out.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_out = stride_out.Get(); - p_shape_y = dim_y.Get(); -#endif - - platform::ForRange for_range(dev_ctx, numel); - KronElemFunctor functor(x.data(), y.data(), out->data(), - p_shape_y, p_stride_x, p_stride_y, p_stride_out, - ndims); - for_range(functor); - } -}; - -template -struct KronGradElemFunctor { - KronGradElemFunctor(const T* dout, const T* A, const T* B, T* dout_a, - T* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = dout_[idx] * B_[index_b]; - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = dout_[idx] * A_[index_a]; - } - } - - private: - const T* dout_; - const T* A_; - const T* B_; - T* dout_a_; - T* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradElemFunctor> { - KronGradElemFunctor(const platform::complex* dout, - const platform::complex* A, - const platform::complex* B, - platform::complex* dout_a, - platform::complex* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * 
pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = - dout_[idx] * - platform::complex(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * - platform::complex(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - const platform::complex* dout_; - const platform::complex* A_; - const platform::complex* B_; - platform::complex* dout_a_; - platform::complex* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& dout, - const framework::Tensor& x, const framework::Tensor& y, - framework::Tensor* dx, framework::Tensor* dy) { - int ndims = dout.dims().size(); - int64_t numel = dout.numel(); - int64_t numel_x = x.numel(); - int64_t numel_y = y.numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_dout = dout.dims(); - - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_dout = phi::stride(dim_dout); - - const int64_t* p_stride_x = nullptr; - const int64_t* p_stride_y = nullptr; - const int64_t* p_stride_dout = nullptr; - const int64_t* p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_dout(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_dout.Get(), stride_dout.Get() + ndims, - d_stride_dout.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_dout = stride_dout.Get(); - p_shape_y = dim_y.Get(); -#endif - // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) - // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) - framework::Tensor dout_x; - T* p_dout_x = nullptr; - if (dx) { - dout_x.mutable_data({numel_x, numel_y}, dev_ctx.GetPlace()); - p_dout_x = dout_x.data(); - } - framework::Tensor dout_y; - T* p_dout_y = nullptr; - if (dy) { - dout_y.mutable_data({numel_y, numel_x}, dev_ctx.GetPlace()); - p_dout_y = dout_y.data(); - } - - platform::ForRange for_range(dev_ctx, numel); - KronGradElemFunctor func(dout.data(), x.data(), y.data(), - p_dout_x, p_dout_y, p_stride_dout, p_stride_x, - p_stride_y, p_shape_y, numel_x, numel_y, ndims); - for_range(func); - -// reduce_sum along aixs 1 -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = dev_ctx.stream(); // it is a cuda device_context - if (dx) { - TensorReduceImpl>( - dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); - } - if (dy) { - TensorReduceImpl>( - dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); - } -#else - auto* place 
= dev_ctx.eigen_device(); - Eigen::array reduce_dim = {1}; - if (dx) { - auto eigen_dout_x = framework::EigenMatrix::Reshape(dout_x, 1); - auto eigen_vec_dx = framework::EigenVector::Flatten(*dx); - eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); - } - if (dy) { - auto eigen_dout_y = framework::EigenMatrix::Reshape(dout_y, 1); - auto eigen_vec_dy = framework::EigenVector::Flatten(*dy); - eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); - } -#endif - } -}; - -inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) { - const framework::DDim& shape = src.dims(); - int rank = shape.size(); - framework::Tensor res; - res.ShareDataWith(src); - PADDLE_ENFORCE_LE( - rank, ndims, - platform::errors::InvalidArgument( - "The input Tensor's rank should be less than or equal to ndims" - "Received input Tensor's rank = %d, ndims = %d", - rank, ndims)); - if (rank < ndims) { - std::vector new_dim(ndims, 1); - for (int i = ndims - rank; i < ndims; i++) { - new_dim[i] = shape[i - ndims + rank]; - } - res.Resize(phi::make_ddim(new_dim)); - } - return res; -} - -template -class KronKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int ndims = out->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - KronOpFunctor func; - func(dev_ctx, xx, yy, out); - } -}; - -template -class KronGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - } - - int ndims = dout->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - framework::Tensor* pdxx = nullptr; - framework::Tensor* pdyy = nullptr; - framework::Tensor dxx; - framework::Tensor dyy; - if (dx) { - dxx = UnsqueezeTo(*dx, ndims); - pdxx = &dxx; - } - - if (dy) { - dyy = UnsqueezeTo(*dy, ndims); - pdyy = &dyy; - } - - KronGradOpFunctor func; - func(dev_ctx, *dout, xx, yy, pdxx, pdyy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 8770abdac838f63b0c9f3a95b1ac0283a80ecbf2..26b6ce43303d181c41b60cf36c229d00acb0e626 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -12,459 +12,43 @@ // See the License for the specific language governing permissions and // limitations under the License. 
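+// The log_softmax forward/backward GPU kernels in this file delegate to the
+// shared softmax GPU drivers declared in
+// paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+// (phi::SoftmaxForwardCUDAKernelDriver and phi::SoftmaxBackwardCUDAKernelDriver);
+// the kernels here only prepare the input/output tensors and the axis attribute.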
-#include #include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/functors.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { -#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxForwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - dst, src, outer_size, dim_size); \ - break; - -template -__device__ __forceinline__ T WarpReduceSum(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = value + sum_val; - } - return value; -} - -template -__device__ __forceinline__ T WarpReduceMax(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = max(value, max_val); - } - return value; -} - -int GetNearGreaterPowerOfTwo(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) { - ++log2_value; - } - return 1 << log2_value; -} - -template -__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, - int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT elements[warp_iter]; - // set effective_element_count as the num of elements when warps do effective - // work - // set effective_element_count as 0, when warps do ineffective work - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - elements[it] = - static_cast(src[batch_id * element_count + element_index]); - } else { - elements[it] = -std::numeric_limits::infinity(); - } - } - - // 2.compute max_value. For each thread, loop all registers to find max - AccT max_value = elements[0]; -#pragma unroll - for (int it = 1; it < warp_iter; ++it) { - max_value = (max_value > elements[it]) ? max_value : elements[it]; - } - max_value = WarpReduceMax(max_value); - - // 3.For each warp, accumulate all thread registers - AccT sum = 0.0f; -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - sum += std::exp(elements[it] - max_value); - } - sum = WarpReduceSum(sum); - - // 4.store result. - sum = std::log(sum); -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - dst[batch_id * element_count + element_index] = - static_cast(elements[it] - max_value - sum); - } else { - break; - } - } -} - -template -void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_FORWAR_COMPUTE(1); - LAUNCH_WARP_FORWAR_COMPUTE(2); - LAUNCH_WARP_FORWAR_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_FORWAR_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_FORWAR_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_FORWAR_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_FORWAR_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_FORWAR_COMPUTE(128); // dim_size 65~128 - LAUNCH_WARP_FORWAR_COMPUTE(256); // dim_size 129~256 - LAUNCH_WARP_FORWAR_COMPUTE(512); // dim_size 257~512 - LAUNCH_WARP_FORWAR_COMPUTE(1024); // dim_size 513~1024 - - default: - break; - } -} - -// Returns the final item after reduce operation along block.x. -// Firstly, get shared memory(smem) offset, find the starting position for every -// y. -// Secondly, initialise every smem position with value 'val' of thread itself. -// Thirdly, apply standard reduction along x direction as below: -// -// -> x direction -// [o o o o o o o o] time 0 -// | |/ / -// | /| / -// | / | / -// |/ |/ -// [o o o o x x x x] time 1 -// | |/ / -// |/|/ -// [o o x x x x x x] time 2 -// |/ -// [o x x x x x x x] time 3 -// -// Finally, return the first item. -// Imaging multiple reductions executed in paralell along y axis, -// Note that when blockDim.x is not 1, it's a EVEN number in all cases, -// and the size of shared memory is even as well. -template class Functor> -__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) { - Functor func; - // This reduction is not Block-wise reduction, only reduce along block.x. - // therefore the shared mem has offsets for different block.y. - shared += threadIdx.y * blockDim.x; - shared[threadIdx.x] = val; - int offset = blockDim.x / 2; - - while (offset > 0) { - __syncthreads(); - if (threadIdx.x < offset) { - shared[threadIdx.x] = - func(shared[threadIdx.x], shared[threadIdx.x + offset]); - } - offset /= 2; - } - __syncthreads(); - return shared[0]; -} - -template -__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( - T *output, const T *input, int outer_size, int dim_size, int inner_size) { - extern __shared__ unsigned char smem[]; - auto sdata = reinterpret_cast(smem); - - const int outer_stride = inner_size * dim_size; - const int dim_stride = inner_size; - - for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) { - for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size; - y_id += blockDim.y * gridDim.y) { - const int data_offset = x_id * outer_stride + y_id; - // When blockDim.x==1, no block.x-reduction opetaions are needed. - // And threadIdx.x is 0 all the time, so the for-loops below are literally - // loops (No parallel executions). Loop all elements along axis and - // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final - // log_softmax values along that axis. - // 1. reduce max - AccT max_value = -std::numeric_limits::infinity(); - // For one thread, iterate all items it responsable for, and get - // max_value. - // If there are N threads, N max_value will be returned. 
- for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - const AccT value = - static_cast(input[data_offset + d * dim_stride]); - max_value = phi::funcs::MaxFunctor()(max_value, value); - } - // If there are more than 1 threads along block x, reduce all max_values - // and get the global max_value, which is the max value along "axis". - // If there is only one thread along block x, no need to reduce, as the - // 'max_value' is the global max_value. - if (blockDim.x > 1) { - max_value = BlockReduceAlongDimX( - sdata, max_value); - } - - // 2. reduce sum - AccT sum = 0; - // Below is the same execution as '1. reduce max' - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - sum += std::exp(static_cast(input[data_offset + d * dim_stride]) - - max_value); - } - if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); - } - - // 3. input-max-log_sum and write to output - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - output[data_offset + d * dim_stride] = static_cast( - static_cast(input[data_offset + d * dim_stride]) - max_value - - std::log(sum)); - } - } - } -} - -// block.y covers inner_size. Threads along the x axis process dim_size -// elements, and make sure not to exceed the 1024 threads per block. -// Note that dim_threads namely blockDim.x is either 1 or a even number. -inline dim3 GetBlockSize(int dim_size, int inner_size) { - int inner_threads = inner_size; - inner_threads = std::min(inner_threads, 1024); - int dim_threads = 1; - - while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) { - dim_threads *= 2; - } - dim_threads /= 2; - return dim3(dim_threads, inner_threads); -} - -// First cover the y axis as many blocks as possible. -// Then cover the x axis as many blocks as possible, -// and make sure not to exceed the max_active_blocks. -inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size, - int dim_size, int inner_size) { - int inner_blocks = (inner_size + block.y - 1) / block.y; - if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks; - - int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; - if (outer_blocks > outer_size) outer_blocks = outer_size; - return dim3(outer_blocks, inner_blocks); -} - -// When designing grid size and block size, priority is given to block size, -// and grid will be determined according to the maximum number of active blocks, -// which is set by as a experience value. -template -void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size, - int inner_size, dim3 &grid, dim3 &block, - int &shared_mem, int num_sm) { - block = GetBlockSize(dim_size, inner_size); - int block_threads = block.x * block.y; - shared_mem = block.x == 1 ? 
0 : block_threads * sizeof(T); - int max_active_blocks = num_sm * 2; - grid = - GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); -} - -template -void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, - const T *input_data, - int outer_size, int dim_size, - int inner_size, int num_sm, - gpuStream_t stream) { - int shared_mem; - dim3 grid; - dim3 block; - - ComputeLaunchConfigure( - &LogSoftmaxForwardCUDAKernelNotLastAxis, outer_size, dim_size, - inner_size, grid, block, shared_mem, num_sm); - - LogSoftmaxForwardCUDAKernelNotLastAxis< - T, MPDType><<>>( - output_data, input_data, outer_size, dim_size, inner_size); -} +using Tensor = framework::Tensor; template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *x = context.Input("X"); - auto *out = context.Output("Out"); - const auto *input_data = x->data(); - auto *output_data = out->mutable_data(context.GetPlace()); - - const int rank = x->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); - int dim_size = x->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < x->dims().size(); ++i) { - inner_size *= x->dims()[i]; - } - int outer_size = SizeToAxis(axis, x->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - int num_sm = context.cuda_device_context().GetSMCount(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxForwardForLastAxis(output_data, input_data, - dim_size, outer_size, stream); - } else { - LaunchLogSoftmaxForwardCUDAKernelNotLastAxis( - output_data, input_data, outer_size, dim_size, inner_size, num_sm, - stream); - } + int input_axis = ctx.Attr("axis"); + auto &dev_ctx = ctx.template device_context(); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); } }; -// Backward below -#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxBackwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - output, grad_output, grad_input, outer_size, dim_size); \ - break; - -template -__global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, - const T *grad_output, - T *grad_input, int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT output_register[warp_iter]; - AccT grad_output_register[warp_iter]; - int effective_element_count = (batch_id < batch_size) ? 
element_count : 0; - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - output_register[iter] = - static_cast(output[batch_id * element_count + element_index]); - grad_output_register[iter] = static_cast( - grad_output[batch_id * element_count + element_index]); - } else { - output_register[iter] = static_cast(0); - grad_output_register[iter] = static_cast(0); - } - } - - // 2. For each warp, accumulate all thread registers - AccT sum = grad_output_register[0]; -#pragma unroll - for (int iter = 1; iter < warp_iter; ++iter) { - sum += grad_output_register[iter]; - } - sum = WarpReduceSum(sum); - -// 3. write result in grad_input -#pragma unroll - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - grad_input[batch_id * element_count + element_index] = static_cast( - (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); - } - } -} - -template -void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, - const T *output, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_BACKWARD_COMPUTE(1); // dim_size: 1 - LAUNCH_WARP_BACKWARD_COMPUTE(2); // dim_size: 2 - LAUNCH_WARP_BACKWARD_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_BACKWARD_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_BACKWARD_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_BACKWARD_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_BACKWARD_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_BACKWARD_COMPUTE(128); // dim_size: 65~128 - LAUNCH_WARP_BACKWARD_COMPUTE(256); // dim_size: 129~256 - LAUNCH_WARP_BACKWARD_COMPUTE(512); // dim_size: 257~512 - LAUNCH_WARP_BACKWARD_COMPUTE(1024); // dim_size: 513~1024 - - default: - break; - } -} - template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *out = context.Input("Out"); - const auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *out = ctx.Input("Out"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); - const auto *out_data = out->data(); - const auto *d_out_data = d_out->data(); - auto *d_x_data = d_x->mutable_data(context.GetPlace()); - - const int rank = out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = out->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < out->dims().size(); ++i) { - inner_size *= out->dims()[i]; - } - int outer_size = SizeToAxis(axis, out->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - 
LaunchSoftmaxBackwardForLastAxis( - d_x_data, d_out_data, out_data, dim_size, outer_size, stream); - } else { - LogSoftmaxGradFunctor()( - context.template device_context(), out, - d_out, d_x, axis); - } + int input_axis = ctx.Attr("axis"); + auto &dev_ctx = ctx.template device_context(); + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, + input_axis, dx); } }; @@ -473,6 +57,17 @@ class LogSoftmaxGradKernel namespace ops = paddle::operators; namespace plat = paddle::platform; + +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL( + log_softmax, ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + log_softmax_grad, ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); +#else REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, @@ -483,3 +78,4 @@ REGISTER_OP_CUDA_KERNEL( ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel); +#endif diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 92c9857f0b942f00c348a6199ea4b9789b398328..10e2867bf2953f5c6fbc3d50bd8156fa3b0266e9 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,9 +17,11 @@ #include #include +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor tau = dito.Fill(tau_dims_vec, 0); auto tau_data = tau.mutable_data(context.GetPlace()); + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m >= n) { Tensor tmp_x = dito.Transpose(new_x); Tensor tmp_y = dito.Transpose(new_y); @@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn}); // Step 3, solve R X = Y - triangular_solve(dev_ctx, res_r, slice_y, solution, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, res_r, slice_y, true, + false, false, solution); + } else { auto x_data = new_x.mutable_data(context.GetPlace()); auto y_data = new_y.mutable_data(context.GetPlace()); @@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); - triangular_solve(dev_ctx, trans_r, new_y, solution, - true, true, false); + phi::TriangularSolveKernel(phi_dev_ctx, trans_r, new_y, true, + true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, batch_count, n, n, min_mn, x_data, diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 3cbbc62e7bec92f329535e788f19d439c9341a0e..520722dafcbea3ce8c545389317516cc22f7689f 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -22,7 +22,6 @@ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 
f323e2e041d994eb01c9d4e934984b8a005ffcec..214b2eccae9f75e9bfcfa3df0b823918e2b0c353 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -15,12 +15,13 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -555,6 +556,11 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor Pmat; Unpack_Pivot(dev_ctx, *P, &Pmat, m, k); + + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m <= n) { if (k < n) { framework::Tensor U_complement, U_grad_complement, phi_complement, @@ -605,8 +611,9 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor psi_principal, phi_mH, psi_tmp; Tensor_Conj(dev_ctx, phi, &phi_mH); phi_mH = helper.Transpose(phi_mH); - triangular_solve(dev_ctx, U_narrow, phi_mH, - &psi_principal, true, false, false); + + phi::TriangularSolveKernel( + phi_dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal); Tensor_Conj(dev_ctx, psi_principal, &psi_principal); psi_principal = helper.Transpose(psi_principal); @@ -620,8 +627,9 @@ class LUGradKernel : public framework::OpKernel { SetValueCompute_dispatch(ctx, &psi, &psi_principal, &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); - triangular_solve(dev_ctx, L_narrow_mH, psi, &psi_tmp, - true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, psi, + true, false, true, &psi_tmp); auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); @@ -672,8 +680,10 @@ class LUGradKernel : public framework::OpKernel { &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; - triangular_solve(dev_ctx, L_narrow_mH, phi, - &psi_principal, true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, phi, + true, false, true, &psi_principal); + slice_starts[0] = 0; slice_starts[1] = 0; slice_ends[0] = k; @@ -695,8 +705,8 @@ class LUGradKernel : public framework::OpKernel { psi_tmp = helper.Transpose(psi_tmp); Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - triangular_solve(dev_ctx, U_narrow_mH, psi_tmp, &psi, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, U_narrow_mH, psi_tmp, + true, false, false, &psi); *dx = helper.Transpose(psi); } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 31a98d9f630e1c01f3b886cbe91dd3882b384d05..af1069cb867993160d7346779d7de8161e37438c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,7 +20,6 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) -math_library(pooling) if(WITH_MKLDNN) math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler) diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 883ee9b148654f8621b26942739730426ba7fc7d..7b239b8166644697581d0051f12b6abacc6832fa 100644 
--- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -34,45 +34,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor* a, framework::Tensor* b, bool left, - bool upper, bool transpose, bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - for (int i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda, - b_data + i * N * M, ldb); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index d3490ead212731f3fc6a75d61a31c11c72c9129d..737196dde1dfc26269fe083fe17037c829ef8109 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -161,67 +161,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, const Tensor* a, - Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - if (batch_size <= 8 && M >= 64) { - for (auto i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - a_data + i * M * M, lda, b_data + i * N * M, ldb); - } - } else { - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data + i * M * M; - cpu_ptrs[i + batch_size] = b_data + i * M * N; - } - - // Copy the addresses of A and tmp_b from host to device. 
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - - const T** gpu_a_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()); - T** gpu_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 1dc43205592f69cc105b43fe49b2f7872f8251c3..415d0c6dd8e0cf51958783c32aa49c66cce9e15c 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -117,14 +117,6 @@ class MatrixSolveFunctor { const framework::Tensor& b, framework::Tensor* out); }; -template -class TriangularSolveFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor* a, - framework::Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular); -}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h deleted file mode 100644 index dfd3dad38644b65ef0b5e62e1b54ce210e9c489a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/pooling.h +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -/* - * \brief Extracting simple operations from pooling. - * Both MaxPool and AvgPool need "initial", "compute" and "finalize" - * operation. - * MaxPool initializes temp variable to the negative maximum to find the - * maximum value in the pooling field. - * AvgPool initializes temp variable to the zero to accumulate all values - * in pool pooling, and finally takes the average. - * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. - */ -template -class MaxPool { - public: - DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? 
*y : x; } - DEVICE inline void finalize(const T& pool_field, T* y) {} -}; - -template -class AvgPool { - using MT = typename details::MPTypeTrait::Type; - MT intermediate_res; - - public: - DEVICE inline T initial() { - intermediate_res = static_cast(0.0f); - return static_cast(0); - } - - DEVICE inline void compute(const T& x, T* y) { - intermediate_res += static_cast(x); - } - - DEVICE inline void finalize(const T& pool_field, T* y) { - *y = static_cast(intermediate_res / (static_cast(pool_field))); - } -}; - -template -class MaxPoolGrad { - public: - static constexpr bool use_x = true; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += dy * static_cast(x == y); - } -}; - -template -class AvgPoolGrad { - public: - static constexpr bool use_x = false; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += (scale * dy); - } -}; - -/* used for adaptive pool to calculate start and end index of each divided grid - */ -HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - -/* - * \brief Getting pooling results, and calculating gradient. - * - * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C - * is the number of channels, H and W is the height and width of feature. - * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C - * is the number of channels, D, H and W is the depth, height and width of - * feature. - * - * In max pooling, it is possible that the pooling region has multiple maximum - * elements. In this case, we should compute the gradient of the first maximum - * element. - * This is different from average pooling. So we rewrite the max_pool_grad: - * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
- */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool2dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool2dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool3dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool3dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, 
bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. 
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 1f04875c2203b2af80aa3cb81aaf95fbb0a6fe6c..e7d08b6597360bb0431add6ae63eb99f401c8ce0 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_rank_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -70,9 +69,9 @@ class MatrixRankeOp : public framework::OperatorWithKernel { std::vector x_batch_dims_array(max_dim); std::vector tol_dims_array(max_dim); std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(), - tol_dims_array.data(), out_dims_array.data(), - max_dim, axis); + phi::funcs::GetBroadcastDimsArrays( + dim_x_batch, dim_tol, x_batch_dims_array.data(), + tol_dims_array.data(), out_dims_array.data(), max_dim, axis); ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); } } else { @@ -115,141 +114,9 @@ class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches, - int rows, int cols, int k) { - // Eigen::Matrix API need non-const pointer. - T* input = const_cast(x_data); - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, rows); - Eigen::SelfAdjointEigenSolver< - Eigen::Matrix> - eigen_solver(m); - auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); - for (int j = 0; j < k; j++) { - *(eigenvalues_data + i * k + j) = eigenvalues[j]; - } - } -} - -template -void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows, - int cols, int k) { - // Eigen::Matrix API need non-const pointer. 
- T* input = const_cast(x_data); - int stride = rows * cols; - Eigen::BDCSVD< - Eigen::Matrix> - svd; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, cols); - svd.compute(m); - auto res_s = svd.singularValues(); - for (int j = 0; j < k; j++) { - eigenvalues_data[i * k + j] = res_s[j]; - } - } -} - -template -class MatrixRankCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); - } else { - BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations( - context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CPUDeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - - int axis = -1; - if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - } else { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::LessThanFunctor(), &compare_result); - } - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker); - -REGISTER_OP_CPU_KERNEL(matrix_rank, 
ops::MatrixRankCPUKernel, - ops::MatrixRankCPUKernel); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu deleted file mode 100644 index dccd716022d2ab74d3f6aa35aa70780ac4feba16..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ /dev/null @@ -1,316 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/matrix_rank_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/compare_functors.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -namespace detail { -DDim GetUDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); -} - -DDim GetVHDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); -} -} // namespace detail - -template -class MatrixRankGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - // Must Copy X once, because the gesvdj will destory the content when exit. 
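The copy noted above is needed because cusolver's gesvdj routines overwrite their input matrix. Beyond that detail, the CPU kernel removed above and the GPU kernel removed here compute the rank by the same rule: count the singular values (or absolute eigenvalues in the Hermitian case) that are strictly greater than tol = max(atol, rtol * sigma_max). A minimal host-side sketch of that rule, with illustrative names (not Paddle code):

#include <algorithm>
#include <cstdint>
#include <vector>

// rank = #{ sigma_i : sigma_i > max(atol, rtol * max_i sigma_i) }
int64_t RankFromSingularValues(const std::vector<double>& sigma, double atol,
                               double rtol) {
  double sigma_max = 0.0;
  for (double s : sigma) sigma_max = std::max(sigma_max, s);
  const double tol = std::max(atol, rtol * sigma_max);
  int64_t rank = 0;
  for (double s : sigma) {
    if (s > tol) ++rank;  // strictly greater than the tolerance
  }
  return rank;
}
// Example: sigma = {5.0, 0.3, 1e-12}, atol = 0, rtol = 1e-7  ->  rank = 2.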
- Tensor x_tmp; - paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); - auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); - int* info_ptr = reinterpret_cast(info->ptr()); - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - SyevjBatched(dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, - info_ptr); - platform::ForRange for_range( - dev_ctx, eigenvalue_tensor.numel()); - phi::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); - for_range(functor); - } else { - Tensor U, VH; - auto* u_data = - U.mutable_data(detail::GetUDDim(dim_x, k), context.GetPlace()); - auto* vh_data = - VH.mutable_data(detail::GetVHDDim(dim_x, k), context.GetPlace()); - GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data(), vh_data, - u_data, eigenvalue_data, info_ptr, 1); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations(context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, - context.device_context(), &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CUDADeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - int axis = -1; - ElementwiseComputeEx, - platform::CUDADeviceContext, T, int64_t>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } - - void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int m, int n, int k, T* A, T* U, T* V, T* S, int* info, - int thin_UV = 1) const; - - void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int n, T* A, T* W, int* info) const; -}; - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, float* A, float* U, float* V, float* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? 
k : n); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, double* A, double* U, double* V, double* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? k : n); - for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - // check the error info - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A, - float* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // matrix is saved as column-major in cusolver. 
- // numpy and torch use lower triangle to compute eigenvalues, so here use - // upper triangle - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A, - double* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // upper triangle of A is stored - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel, - ops::MatrixRankGPUKernel); -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index ab02d4cfed9d54f9d168f6088df3e41d3e3e7c54..1078b451c55bae09c1274fe6ce3f45d21574d5e1 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { using framework::DataLayout; +using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; @@ -83,11 +85,11 @@ class PoolingMKLDNNHandler phi::slice_ddim(input_dims, 2, input_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); const auto src_tz = phi::vectorize(input->dims()); const auto dst_tz = phi::vectorize(output->dims()); @@ -173,11 +175,11 @@ class PoolingMKLDNNHandler framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); auto src_tz = phi::vectorize(in_x->dims()); auto diff_src_tz = phi::vectorize(in_x_grad->dims()); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index e9dadd5ec937cd11c84777a582cc1f7ac9fc3c33..4090d5ffca801512e423b02bfda3dd1a1bc49f03 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); @@ -32,6 +33,8 @@ USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 9d0062e31388413fd4a441687631faebe8846c6e..717af61b858dc16f9bdda20f530cbf06a09908eb 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -26,13 +26,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); namespace paddle { diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index e212f4e7e2b7d1ad7964cc9351f1c4e241d5a79e..122b6a8a80aac95ab98ad95ed3e6339684978d12 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/one_hot_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,26 +26,6 @@ namespace operators { class OneHotV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 1, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 1.")); - - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; @@ -114,10 +98,12 @@ Out is a LoDTensor: } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor, + PD_INFER_META(phi::OneHotRawInferMeta)); + REGISTER_OPERATOR( one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot_v2, ops::OneHotV2Kernel, - ops::OneHotV2Kernel); + paddle::framework::EmptyGradOpMaker, + OneHotInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu deleted file mode 100644 index 77e2a931e50de5b7775463fc7bbf6262e2ad4a53..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, - const int64_t numel, const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotV2OpCUDAFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotV2OpCUDAFunctor(const framework::LoDTensor* in, - framework::LoDTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - p_in_data, p_out_data, numel, depth_); - } -}; - -using LoDTensor = framework::LoDTensor; -template -class OneHotV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), - &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot_v2, - ops::OneHotV2CUDAKernel, - ops::OneHotV2CUDAKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index acf6baf50b418ae0fd68d64f52f80f47df1c60c3..e5702a37bb2b4a4180e209bb5e306be64830bd99 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/one_hot_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template class OneHotV2NPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5399ee36ba7ff4a983448d607c108db8870138c --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class MLUMergedMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); + size_t n = params.size(); + PADDLE_ENFORCE_EQ(n, params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(params[i], params_out[i], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + } + + auto grads = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ( + n, grads.size(), + platform::errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), n)); + + auto velocitys = ctx.MultiInput("Velocity"); + PADDLE_ENFORCE_EQ(n, velocitys.size(), + platform::errors::InvalidArgument( + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), n)); + + auto velocitys_out = ctx.MultiOutput("VelocityOut"); + PADDLE_ENFORCE_EQ( + n, velocitys_out.size(), + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + auto mu = ctx.Attr("mu"); + auto lrs = ctx.MultiInput("LearningRate"); + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, lrs.size(), + platform::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, 
the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), n)); + } + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_methods = + ctx.Attr>("regularization_method"); + auto regularization_coeffs = + ctx.Attr>("regularization_coeff"); + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, regularization_methods.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), n)); + PADDLE_ENFORCE_EQ( + n, regularization_coeffs.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + + auto& dev_ctx = ctx.template device_context(); + + Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); + MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); + + for (size_t idx = 0; idx < n; ++idx) { + RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? RegularizationType::kL2DECAY + : RegularizationType::kNONE; + T regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = static_cast(regularization_coeffs[idx]); + } + + auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0]; + auto param_out = params_out[idx]; + auto velocity_out = velocitys_out[idx]; + + auto grad = grads[idx]; + Tensor regularized_grad; + MLUCnnlTensorDesc param_desc(*param_out); + if (regularization_flag == RegularizationType::kL2DECAY) { + regularized_grad = ctx.AllocateTmpTensor( + param_out->dims(), dev_ctx); + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_ADD, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(), + GetBasePtr(param_out), param_desc.get(), + GetBasePtr(grad), param_desc.get(), + GetBasePtr(®ularized_grad), ToCnnlDataType(), + regularization_coeff); + } else { + regularized_grad = *grad; + } + MLUCnnl::ApplyMomentum(ctx, param_desc.get(), + GetBasePtr(®ularized_grad), use_nesterov, + GetBasePtr(learning_rate), GetBasePtr(&mu_tensor), + GetBasePtr(param_out), GetBasePtr(velocity_out)); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(merged_momentum, ops::MLUMergedMomentumOpKernel, + ops::MLUMergedMomentumOpKernel); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc deleted file mode 100644 index 6335004e69a37109664940e4d3445e3694be9cc9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ /dev/null @@ -1,567 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pool_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/operator.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using DataLayout = platform::DataLayout; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -DataLayout getLayoutFromStr(std::string data_format) { - if (data_format == "NHWC") { - return DataLayout::kNHWC; - } else if (data_format == "NCHW") { - return DataLayout::kNCHW; - } else if (data_format == "NCDHW") { - return DataLayout::kNCDHW; - } else { - return DataLayout::kNCDHW; - } -} - -template -class PoolCUDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - Tensor *output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - - // -----------------transformed tensor ------------------------ - - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - DataLayout layout; - - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - 
in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans; - trans(dev_ctx, *input, &transformed_input, axis); - - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - } - - const T *tranformed_input_data = transformed_input.data(); - T *tranformed_output_data = transformed_output.mutable_data( - transformed_output.dims(), ctx.GetPlace()); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - -#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, - false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, - tranformed_output_data)); -#endif - // add - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, transformed_output, output, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose trans; - trans(dev_ctx, transformed_output, output, axis); - } -#endif - } -}; - -template -class PoolCUDNNGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - const Tensor *output = ctx.Input("Out"); - const Tensor *output_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *input_grad = ctx.Output(framework::GradVarName("X")); - - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - -#ifdef PADDLE_WITH_HIP - if (pooling_type == "max") { - using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; - using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; - auto &all_op_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - std::string op_type = "pool2d_grad"; - auto kernels_iter = all_op_kernels.find(op_type); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op_type)); - OpKernelMap &kernels = kernels_iter->second; - paddle::framework::OpKernelType expected_kernel_key( 
- paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", - op_type, KernelTypeToString(expected_kernel_key))); - std::unique_ptr kernel_func_( - new OpKernelFunc(kernel_iter->second)); - (*kernel_func_)(ctx); - return; - } -#endif - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - // ------- tensor grad -------------- - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_output_grad(output_grad->type()); - - input_grad->mutable_data(ctx.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - DataLayout layout; - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans5_v3; - trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); - -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans4; - trans4(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans4_v2; - trans4_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans4_v3; - trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - transformed_output_grad = *output_grad; - transformed_input_grad = *input_grad; - } - - const T *input_data = transformed_input.data(); - const T *output_data = transformed_output.data(); - const T *output_grad_data = transformed_output_grad.data(); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - if (FLAGS_cudnn_deterministic) { - pooling_mode = PoolingMode::kMaximumDeterministic; - } else { - pooling_mode = PoolingMode::kMaximum; - } - } else { - pooling_mode = exclusive ? PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - if (input_grad) { - T *input_grad_data = transformed_input_grad.mutable_data( - transformed_input_grad.dims(), ctx.GetPlace()); -// Because beta is zero, it is unnecessary to reset input_grad. 
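The comment above relies on cuDNN's documented alpha/beta convention: the freshly computed result is scaled by alpha and blended with the prior contents of the destination scaled by beta. A sketch of the rule, with illustrative names:

// out = alpha * computed + beta * out_prior (cuDNN-style output blending).
// With alpha = 1 and beta = 0 the prior contents of the destination are
// ignored, which is why the gradient buffer is not zero-filled beforehand.
inline float BlendOutput(float computed, float out_prior, float alpha,
                         float beta) {
  return alpha * computed + beta * out_prior;
}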
-#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); -#endif - - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v4; - trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose - trans4_v4; - trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#endif - } - } -}; - -template -class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - std::string pooling_type = ctx.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolCUDNNOpKernel::Compute(ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel); -#else -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -#endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ae095c2fa7aaa95cf667898b63a90988eb83caf0..44f3d8090e565c1581a49387db4b834b1abf8b62 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,6 +15,12 @@ limitations under the License. 
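// Editor's note (illustration only): this hunk drops the fluid-side shape
// inference for pooling in favor of phi::PoolInferMeta. For reference, a
// standalone sketch of the output-size arithmetic implemented by the
// PoolOutputSize helper deleted just below; plain C++ with assumed names,
// not Paddle's API:
#include <cassert>
int PoolOutputSizeSketch(int input_size, int filter_size, int padding_1,
                         int padding_2, int stride, bool ceil_mode) {
  int span = input_size - filter_size + padding_1 + padding_2;
  int output_size =
      ceil_mode ? (span + stride - 1) / stride + 1 : span / stride + 1;
  assert(output_size > 0 && "invalid pooling configuration");
  return output_size;
}
// e.g. input 6, kernel 3, stride 2, no padding:
//   floor mode -> (6 - 3) / 2 + 1 = 2,  ceil mode -> (6 - 3 + 1) / 2 + 1 = 3.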
*/ #include "paddle/fluid/operators/pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,125 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding_1, - int padding_2, int stride, bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = - (input_size - filter_size + padding_1 + padding_2) / stride + 1; - } else { - output_size = - (input_size - filter_size + padding_1 + padding_2 + stride - 1) / - stride + - 1; - } - PADDLE_ENFORCE_GT( - output_size, 0, - platform::errors::InvalidArgument( - "the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), " - "padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride)); - return output_size; -} - -void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of Pool operator is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of Pool operator is not found.")); - - std::string pooling_type = ctx->Attrs().Get("pooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool ceil_mode = ctx->Attrs().Get("ceil_mode"); - bool adaptive = ctx->Attrs().Get("adaptive"); - bool global_pooling = ctx->Attrs().Get("global_pooling"); - std::string data_format = ctx->Attrs().Get("data_format"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - - auto in_x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - platform::errors::InvalidArgument( - "the input of Op(pool) should be 4-D or 5-D Tensor. But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims)); - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), phi::make_ddim(ksize))); - - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. 
" - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), phi::make_ddim(ksize), - phi::make_ddim(strides))); - - // MKL-DNN Kernels are using NCHW order of dims description - // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && - (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings if "SAME" or global_pooling - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - std::vector output_shape; - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (int i = 0; i < data_dims.size(); ++i) { - if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { - output_shape.push_back(data_dims[i]); - } else { - output_shape.push_back( - PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], - paddings[2 * i + 1], strides[i], ceil_mode)); - } - } - } - - // output_N = input_N - output_shape.insert(output_shape.begin(), in_x_dims[0]); - // output_C = input_C - if (channel_last) { - output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); - } else { - output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", "Out"); -} - bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window @@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of Pool Gradoperator is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound( - "Input(X@GRAD) of Pool Gradoperator is not found.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); -} - framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("pool2d_grad_grad"); + grad_op->SetType("pool2d_double_grad"); grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); grad_op->SetAttrMap(this->Attrs()); @@ -692,35 +569,34 @@ Example: namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad, + Pool2dDoubleGradInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); + REGISTER_OPERATOR( pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); + 
paddle::framework::DefaultGradOpMaker, + Pool2dInferShapeFunctor); REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad, ops::Pool2dOpGradGradMaker, - ops::Pool2dOpGradGradMaker); -REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp); - -REGISTER_OP_CPU_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); + ops::Pool2dOpGradGradMaker, + Pool2dGradInferShapeFunctor); +REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp, + Pool2dDoubleGradInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); REGISTER_OPERATOR( pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool3d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); + paddle::framework::DefaultGradOpMaker, + Pool3dInferShapeFunctor); +REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu deleted file mode 100644 index 069ce0c1fda853b943a7b414a7a33d9aa6405a89..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_op.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pool_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index bea6506ee86dbfe3ac606a1e8e883bfbf2500f25..d48ac3bd358ef64271de69df4424399b427cfb82 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
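// Editor's note (illustration only): pool_op.h is trimmed to operator
// declarations; the padding/kernel-size helpers now live in phi::funcs. A
// standalone sketch of the "SAME" padding rule implemented by the
// UpdatePadding helper removed below (plain C++, assumed names):
#include <algorithm>
#include <utility>
std::pair<int, int> SamePaddingSketch(int data_size, int stride, int ksize) {
  int out_size = (data_size + stride - 1) / stride;  // ceil(data_size / stride)
  int pad_sum = std::max((out_size - 1) * stride + ksize - data_size, 0);
  int pad_before = pad_sum / 2;
  int pad_after = pad_sum - pad_before;
  return {pad_before, pad_after};
}
// e.g. data 7, stride 2, kernel 3 -> out_size 4, pad_sum 2 -> {1, 1}.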
*/ -#pragma once +// NOTE(Ruibiao): Difficult to remove code from this header file because too +// many files rely on it through "mkldnn_reuse.h" -#include -#include -#include +#pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif namespace paddle { namespace operators { @@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; -template -inline void UpdatePadding(std::vector* paddings, const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const framework::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // set padding size == data_dims.size() * 2 - auto data_shape = phi::vectorize(data_dims); - if (static_cast(paddings->size()) == data_dims.size()) { - for (int i = 0; i < data_dims.size(); ++i) { - T copy_pad = *(paddings->begin() + 2 * i); - paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); - } - } else { - PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), - platform::errors::InvalidArgument( - "Paddings size %d should be the same or twice as the " - "pooling size %d.", - paddings->size(), data_dims.size() * 2)); - } - - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < data_dims.size(); ++i) { - T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; - T pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], - static_cast(0)); - T pad_0 = pad_sum / 2; - T pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - -template -inline void UpdateKsize(std::vector* ksize, - const framework::DDim data_dims) { - ksize->resize(static_cast(data_dims.size())); - for (size_t i = 0; i < ksize->size(); ++i) { - *(ksize->begin() + i) = static_cast(data_dims[i]); - } -} - -inline int getReduceNum(const framework::Tensor& input, - const framework::Tensor* output, - const std::string data_format, - std::vector* reduce_dim) { - // data_format only can be NCHW - bool channel_last = (data_format == "NHWC"); - if (channel_last) { - return 0; - } - int reduce_num = 0; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - if ((output_height == 1) && (output_width == 1)) { - 
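// Editor's note (illustration only): when the pooled output is 1x1 on NCHW
// input, average pooling degenerates to a mean over the spatial dims {2, 3};
// the lines just below record reduce_dim = {2, 3} and reduce_num = H * W so
// the adaptive-average case can be handed to a generic reduction kernel.
// Minimal standalone sketch of that degenerate case for one channel plane:
#include <vector>
float GlobalAvgPoolPlane(const std::vector<float>& hw_plane) {
  if (hw_plane.empty()) return 0.f;
  float sum = 0.f;
  for (float v : hw_plane) sum += v;  // reduce over all H * W elements
  return sum / static_cast<float>(hw_plane.size());
}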
reduce_dim->push_back(2); - reduce_dim->push_back(3); - reduce_num = input.dims()[2] * input.dims()[3]; - } - return reduce_num; -} - -template -class PoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::string data_format = context.Attr("data_format"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - auto& dev_ctx = context.template device_context(); - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - std::vector reduce_dim; - int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim); - if (reduce_num > 0 && - adaptive) { // for adaptive_avg_pool2d && output_size == 1 -#if defined(__HIPCC__) || defined(__NVCC__) - auto stream = dev_ctx.stream(); - TensorReduceImpl>( - dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), - reduce_dim, stream); -#else // for cpu - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); -#endif - } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); - } - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool3d_forward; - paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool3d_forward; - paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, 
- exclusive, adaptive, out, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class PoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - std::string data_format = context.Attr("data_format"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - auto& dev_ctx = context.template device_context(); - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, static_cast(0.0)); - - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor - pool2d_backward; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool2dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool2d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor - pool3d_backward; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool3d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; - -template -class PoolGradGradKernel : public PoolKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::string pooling_type = context.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad 
only supports avgpool.")); - } else { - PoolKernel::Compute(context); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 08656e64231b61181583cb700f2cc3216e25e516..fa88d128a9a1d572414a6459933a8988cae1fda0 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); @@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } // inputs need with NHWC layout diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index bd26d6350d9c300949edb1a90b244a7c747dd7a9..0efcb8b7981c32e9f8d5a04f4fd4122d6725a49e 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], platform::errors::InvalidArgument( @@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index 402dd6c10803947f73e593d215d28246a81c6706..87c437d8a78e0122b0fc4f5a7dbf51612e40fbf2 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" + #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using framework::Tensor; + xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, bool is_test) { if (pooltype == "max") { diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index d061f9ae05613491cbdbff3793b57a3d89d7d6e5..e0341f4a4b4716d0ee82c9437ddc4d8bd1e35fb2 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_with_index_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,71 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true, - platform::errors::InvalidArgument( - "Output(Mask) of Pooling should not be null.")); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool adaptive = ctx->Attrs().Get("adaptive"); - - PADDLE_ENFORCE( - in_x_dims.size() == 4 || in_x_dims.size() == 5, - platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " - "tensor but received %dD-Tensor", - in_x_dims.size())); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "The input size %d minus the kernel size %d should equal to 2.", - in_x_dims.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size %d and pooling size %d should be the same.", - strides.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), paddings.size(), - platform::errors::InvalidArgument( - "Paddings size %d and pooling size %d should be the same.", - paddings.size(), ksize.size())); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (size_t i = 0; i < ksize.size(); ++i) { - if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { - output_shape.push_back(in_x_dims[i + 2]); - } else { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); - } - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->SetOutputDim("Mask", phi::make_ddim(output_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -106,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Mask"), true, - platform::errors::InvalidArgument("Input(Mask) must not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - 
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -335,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, + MaxPool2dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, + MaxPool2dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); + REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool2dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool2dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool2dWithIndexGradInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, + MaxPool3dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, + MaxPool3dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool3dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc deleted file mode 100644 index 5497dcbd9ce255f833df24989d7a76c40bcbca06..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pool_with_index_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); - -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h deleted file mode 100644 index 6e51a833f5c89efc2621c0ccc3d08dc42b2733a1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_with_index_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxPoolWithIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - Tensor* mask = context.Output("Mask"); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - - auto& dev_ctx = context.template device_context(); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor - pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor - pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class MaxPoolWithIndexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* mask = context.Input("Mask"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - if 
(context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_grad->dims()[i + 2]); - } - } - - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.template device_context(); - phi::funcs::set_constant(device_ctx, in_x_grad, 0); - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor - pool2d_backward; - pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor - pool3d_backward; - pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 5627b4f229e100d9979663e8688b8694188bab0f..ac0cd75237baf5e8b860f197d42cd27bae65270e 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -226,11 +226,7 @@ REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel); + REGISTER_OP_CPU_KERNEL( roi_align_grad, ops::CPUROIAlignGradOpKernel, diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 18941d10e937d3c28e5793384f00d9d97225a128..1a2e64cd45ca401f5fb8ca6b6975a029ba735280 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -33,43 +33,6 @@ static inline int NumBlocks(const int N) { kNumMaxinumNumBlocks); } -template -__device__ T BilinearInterpolate(const T* input_data, const int height, - const int width, T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - - T v1 = input_data[y_low * width + x_low]; - T v2 = input_data[y_low * width + x_high]; - T v3 = input_data[y_high * width + x_low]; - T v4 = input_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - template __device__ void BilinearInterpolateGradient(const int height, const int width, T y, T x, T* w1, T* w2, T* w3, @@ -102,65 +65,6 @@ __device__ void BilinearInterpolateGradient(const int height, const int width, return; } -template -__global__ void GPUROIAlignForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, int* roi_batch_id_data, T* output_data, - const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = BilinearInterpolate(offset_input_data, height, width, y, x); - output_val += val; - } - } - output_val /= count; - output_data[i] = output_val; - } -} - template __global__ void GPUROIAlignBackward( const int nthreads, const T* input_rois, const T* out_grad, @@ -236,105 +140,6 @@ __global__ void GPUROIAlignBackward( } } -template -class GPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; -#ifdef WITH_NV_JETSON - platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256); -#endif - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ( - lod.empty(), false, - platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " - "not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and batch size " - "of images must be the same. But received rois batch size = %d, " - "and images batch size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." 
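// Editor's note (illustration only): the GPU and CPU ROIAlign kernels removed
// in these files rest on the bilinear sampling rule of the deleted
// BilinearInterpolate helper above: points more than one pixel outside the
// feature map contribute 0, coordinates clamp to the border, and each sample
// is a weighted mix of the four surrounding pixels. Standalone CPU sketch
// (assumes plane.size() >= height * width):
#include <vector>
float BilinearSample(const std::vector<float>& plane, int height, int width,
                     float y, float x) {
  if (y < -1.0f || y > height || x < -1.0f || x > width) return 0.f;
  y = y <= 0.f ? 0.f : y;
  x = x <= 0.f ? 0.f : x;
  int y_low = static_cast<int>(y);
  int x_low = static_cast<int>(x);
  int y_high = y_low + 1;
  int x_high = x_low + 1;
  if (y_low >= height - 1) { y_high = y_low = height - 1; y = static_cast<float>(y_low); }
  if (x_low >= width - 1) { x_high = x_low = width - 1; x = static_cast<float>(x_low); }
  float ly = y - y_low, lx = x - x_low;
  float hy = 1.f - ly, hx = 1.f - lx;
  float v1 = plane[y_low * width + x_low];    // top-left
  float v2 = plane[y_low * width + x_high];   // top-right
  float v3 = plane[y_high * width + x_low];   // bottom-left
  float v4 = plane[y_high * width + x_high];  // bottom-right
  return hy * hx * v1 + hy * lx * v2 + ly * hx * v3 + ly * lx * v4;
}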
- " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - GPUROIAlignForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, - out->mutable_data(ctx.GetPlace()), aligned); - } -}; - template class GPUROIAlignGradOpKernel : public framework::OpKernel { public: @@ -416,10 +221,6 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align, - ops::GPUROIAlignOpKernel, - ops::GPUROIAlignOpKernel); REGISTER_OP_CUDA_KERNEL( roi_align_grad, ops::GPUROIAlignGradOpKernel, diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index e71099ed99f00f5846e6e23d5d39b3b2f8997531..589e35e4ab7ae4caf5efd3fb4d93a26b2ca86b26 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -23,152 +23,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -namespace { // NOLINT -constexpr size_t get_offset(size_t x, size_t y, size_t width) { - return y * width + x; -} - -template -struct offsets_and_ratios { - offsets_and_ratios() = default; - offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy, - std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio, - T XY_ratio) - : xy(xy), - xY(xY), - Xy(Xy), - XY(XY), - xy_ratio(xy_ratio), - xY_ratio(xY_ratio), - Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio) {} - - std::size_t xy = 0; - std::size_t xY = 0; - std::size_t Xy = 0; - std::size_t XY = 0; - T xy_ratio = 0.0f; - T xY_ratio = 0.0f; - T Xy_ratio = 0.0f; - T XY_ratio = 0.0f; -}; - -template -std::vector> get_indexes_and_ratios( - std::size_t width, std::size_t height, const T roi_width, - const T roi_height, const T roi_xmin, const T roi_ymin, - std::size_t pooled_width, std::size_t roi_bin_grid_w, - std::size_t pooled_height, std::size_t roi_bin_grid_h) { - const auto ind_num = - pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; - - std::vector> interpolation_cords; - interpolation_cords.reserve(ind_num); - - const auto bin_w = roi_width / pooled_width; - const auto bin_h = roi_height / pooled_height; - - for (std::size_t py = 0; py < pooled_height; py++) { - for (std::size_t px = 0; px < pooled_width; px++) { - for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { - // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); - for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { - // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast(ix + .5f) / - static_cast(roi_bin_grid_w)); - - // deal with elements out of map - if (y < -1.0 || y > height || x < -1.0 || x > width) { - interpolation_cords.emplace_back(); - continue; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - - std::size_t x_low_index = static_cast(x); - std::size_t x_high_index; - if (x_low_index >= width - 1) { - x_high_index = x_low_index = width - 1; - x = static_cast(x_low_index); - } else { - x_high_index = x_low_index + 1; - } - T x_ratio = x_high_index - x; - - std::size_t y_low_index = static_cast(y); - std::size_t y_high_index; - if (y_low_index >= height - 1) { - y_high_index = y_low_index = height - 1; - y = static_cast(y_low_index); - } else { - y_high_index = y_low_index + 1; - } - T y_ratio = y_high_index - y; - - auto xy = get_offset(x_low_index, y_low_index, width); - auto xY = get_offset(x_low_index, y_high_index, width); - auto Xy = get_offset(x_high_index, y_low_index, width); - auto XY = get_offset(x_high_index, y_high_index, width); - - auto xy_ratio = x_ratio * y_ratio; - auto xY_ratio = x_ratio * (1 - y_ratio); - auto Xy_ratio = (1 - x_ratio) * y_ratio; - auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); - - interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio, - Xy_ratio, XY_ratio); - } - } - } - } - return interpolation_cords; -} // namespace - -template -void interpolate(std::vector& interpolated_values, // NOLINT - const std::vector>& interpolation_cords, - const T* data) { - for (auto& ic : interpolation_cords) { - auto xlyl_offset = ic.xy; - auto xhyl_offset = ic.Xy; - auto xlyh_offset = ic.xY; - auto xhyh_offset = ic.XY; - - auto xlyl_ratio = ic.xy_ratio; - auto xhyl_ratio = ic.Xy_ratio; - auto xlyh_ratio = ic.xY_ratio; - auto xhyh_ratio = ic.XY_ratio; - - interpolated_values.emplace_back( - xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + - xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); - } -} - -template -void avg_pool(const std::vector& interpolated_values, T* output_data, - int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width, - int pooled_height) { - const auto data_amount = pooled_width * pooled_height; - const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; - const T count = 1.0 / grid_points; - auto val_begin = interpolated_values.cbegin(); - for (auto i = 0; i < data_amount; ++i) { - T sum = 0.0; - auto val_end = val_begin + grid_points; - sum = std::accumulate(val_begin, val_end, sum); - val_begin = val_end; - output_data[i] = sum * count; - } -} -} // NOLINT - template void bilinear_interpolate_gradient(const int height, const int width, T y, T x, const T out_grad_this_bin, const T count, @@ -213,129 +67,6 @@ void bilinear_interpolate_gradient(const int height, const int width, T y, T x, } } -template -class CPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - 
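// Editor's note (illustration only): the removed CPU kernel precomputes one
// table of bilinear offsets and ratios per ROI (get_indexes_and_ratios above)
// and reuses it for every channel; the final step then averages each
// consecutive block of roi_bin_grid_h * roi_bin_grid_w samples, as the
// deleted avg_pool helper does. Minimal standalone sketch of that averaging:
#include <cstddef>
#include <numeric>
#include <vector>
std::vector<float> AveragePerBin(const std::vector<float>& samples,
                                 std::size_t grid_points) {
  std::vector<float> bins;
  if (grid_points == 0) return bins;
  for (std::size_t i = 0; i + grid_points <= samples.size(); i += grid_points) {
    float sum = std::accumulate(samples.begin() + i,
                                samples.begin() + i + grid_points, 0.0f);
    bins.push_back(sum / static_cast<float>(grid_points));
  }
  return bins;
}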
if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ(lod.empty(), false, - platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " - "does not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - T roi_offset = aligned ? T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? 
sampling_ratio - : ceil(roi_width / pooled_width); - - auto interpolation_cords = get_indexes_and_ratios( - width, height, roi_width, roi_height, roi_xmin, roi_ymin, - pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h); - - std::vector interpolated_values; - interpolated_values.reserve(interpolation_cords.size()); - for (auto channel = 0; channel < channels; ++channel) { - interpolate(interpolated_values, interpolation_cords, batch_data); - avg_pool(interpolated_values, output_data, roi_bin_grid_w, - roi_bin_grid_h, pooled_width, pooled_height); - batch_data += in_stride[1]; - output_data += out_stride[1]; - interpolated_values.clear(); - } - rois_data += roi_stride[0]; - } - } -}; - template class CPUROIAlignGradOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d5b63854d99053ac0620a32cfaba267c7262d515..78509e4299b80ee44610ce3d10f9c57afa0cde18 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 09d2d906653e8c71ddeca7fa606cf5adac8cc596..13490d6fcde3a22e7299db21969d7de6f9a6582c 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/roi_align_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + template class XPUROIAlignOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 815984ac307fdce14a64f01a661b4b7f7ce1d616..d5ef95269b48a1a7e7b9c3e75af4f9b595580ad3 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index bbd5b9c4e7db914d63c9c803c52d44f9350c1d41..d0290795455db1546afbda80e71e79de3f1020ac 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/searchsorted_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -117,10 +116,3 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); - -REGISTER_OP_CPU_KERNEL( - searchsorted, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 7d0d782b837c4c828996e993634373ab38d88eac..513ab46e9b5eebdb39faf4401d9d8b2fc387a82f 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -243,14 +243,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); -REGISTER_OP_CPU_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); - REGISTER_OP_VERSION(set_value) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu deleted file mode 100644 index 9f291a863c067ae0210f44befb89191678291441..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/set_value_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/set_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 4d459f8c01b159549c331f9332e49ed79e7c9b16..4696907f32e6d323c31a27cc6959e26f20168503 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -19,14 +19,10 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/strided_slice_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -36,23 +32,6 @@ namespace operators { using Tensor = framework::Tensor; using DDim = framework::DDim; -inline void GetOffsets(const DDim& big_dim, const DDim& small_dim, - DDim start_offset, int cur_dim, - std::vector* offsets) { - if (cur_dim == big_dim.size()) { - offsets->push_back(start_offset); - return; - } - if (small_dim[cur_dim] == big_dim[cur_dim]) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - } else { - for (int i = 0; i < big_dim[cur_dim]; i++) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - start_offset[cur_dim] += 1; - } - } -} - inline std::string GetValueName(framework::proto::VarType::Type data_type) { std::string value_name; switch (data_type) { @@ -121,253 +100,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } -template -class SetValueGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out"))->dims().size(); - - switch (rank) { - case 1: - SetValueGradCompute<1>(ctx); - break; - case 2: - SetValueGradCompute<2>(ctx); - break; - case 3: - SetValueGradCompute<3>(ctx); - break; - case 4: - SetValueGradCompute<4>(ctx); - break; - case 5: - SetValueGradCompute<5>(ctx); - break; - case 6: - SetValueGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of set_value_grad's input should be less than 7, but " - "received %d.", - rank)); - } - } - - private: - template - void SetValueGradCompute(const framework::ExecutionContext& context) const { - auto starts = context.Attr>("starts"); - auto ends = context.Attr>("ends"); - auto steps = context.Attr>("steps"); - - auto axes_int64 = context.Attr>("axes"); - std::vector axes(axes_int64.begin(), axes_int64.end()); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto steps_indices = Eigen::DSizes(); - auto reverse_axis = Eigen::array(); - - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - auto list_new_steps_tensor = - context.MultiInput("StepsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - - if 
(list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } - - if (list_new_steps_tensor.size() > 0) { - steps = GetDataFromTensorList(list_new_steps_tensor); - } - - auto in = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - in->IsInitialized(), true, - platform::errors::PermissionDenied( - "The input of `set_value_grad`(%s) has not been initialized", - framework::GradVarName("Out"))); - auto grad_value = context.Output( - framework::GradVarName("ValueTensor")); - auto grad_input = - context.Output(framework::GradVarName("Input")); - auto in_dims = in->dims(); - - auto decrease_axis_int64 = - context.Attr>("decrease_axes"); - std::vector decrease_axis(decrease_axis_int64.begin(), - decrease_axis_int64.end()); - std::vector infer_flags(axes.size(), 1); - std::vector out_dims_vector(in_dims.size(), -1); - StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims, - decrease_axis, out_dims_vector.data(), axes.size(), - false); - - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - std::vector reverse_vector(starts.size(), 0); - StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(), - reverse_vector.data(), in_dims, infer_flags, - decrease_axis, starts.size()); - - for (size_t axis = 0; axis < D; axis++) { - starts_indices[axis] = 0; - ends_indices[axis] = out_dims[axis]; - steps_indices[axis] = 1; - reverse_axis[axis] = false; - } - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices[axis_index] = starts[axis]; - ends_indices[axis_index] = ends[axis]; - steps_indices[axis_index] = steps[axis]; - reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false; - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto& dev_ctx = context.template device_context(); - auto& place = - *context.template device_context().eigen_device(); - phi::funcs::SetConstant set_zero; - - if (grad_input) { - // Set gradient of `Input` - paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); - - auto grad_input_t = - framework::EigenTensor::From(*grad_input); - - framework::Tensor tmp(grad_input->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices) - .device(place) = tmp_t; - } - if (grad_value) { - grad_value->mutable_data(context.GetPlace()); - set_zero(dev_ctx, grad_value, static_cast(0)); - - auto in_t = framework::EigenTensor::From(*in); - - if (grad_value->dims() == out_dims) { - auto grad_value_t = - framework::EigenTensor::From(*grad_value); - if (need_reverse) { - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - grad_value_t.device(place) = tmp_t.reverse(reverse_axis); - } else { - grad_value_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - } - } else { - int out_dims_size = out_dims.size(); - auto grad_value_dims = grad_value->dims(); - auto fake_grad_value_dims = out_dims; - - // Create an extented shape according to the rules of broadcast. 
- auto grad_value_dims_size = grad_value_dims.size(); - - int num_decrease = 0; - - int decrease_axis_size = decrease_axis.size(); - for (int i = 0; i < out_dims_size; i++) { - if (decrease_axis.end() != - std::find(decrease_axis.begin(), decrease_axis.end(), i)) { - fake_grad_value_dims[i] = 1; - num_decrease++; - } else if (i < out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)) { - fake_grad_value_dims[i] = 1; - } else { - auto index_grad = - i - (out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)); - fake_grad_value_dims[i] = grad_value_dims[index_grad]; - - PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) || - (grad_value_dims[index_grad] == 1), - true, - platform::errors::InvalidArgument( - "An error occurred while calculating %s: " - "[%s] can not be accumulated into [%s].", - framework::GradVarName("ValueTensor"), - out_dims, grad_value_dims)); - } - } - - VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor") - << "([" << grad_value_dims << "])is broadcasted into [" - << fake_grad_value_dims << "]."; - - auto extent = Eigen::DSizes(); - auto offset = out_dims; - for (int i = 0; i < out_dims_size; i++) { - offset[i] = 0; - extent[i] = fake_grad_value_dims[i]; - } - std::vector offsets; - GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets); - - auto grad_value_t = - framework::EigenTensor:: - From(*grad_value, fake_grad_value_dims); - - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - - // accumulate gradient - for (auto offset : offsets) { - grad_value_t.device(place) = - grad_value_t + - tmp_t.slice(framework::EigenDim::From(offset), extent); - } - if (need_reverse) { - framework::Tensor tmp_value(grad_value->dtype()); - tmp_value.mutable_data(fake_grad_value_dims, context.GetPlace()); - auto tmp_value_t = - framework::EigenTensor::From(tmp_value); - tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis); - grad_value_t.device(place) = tmp_value_t; - } - } - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 374992096605bfef0433992193e54306c3a12858..3840b99dd176d5b348533f3e50f7f90fc3250ea1 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -23,6 +24,10 @@ limitations under the License. 
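// [Editor's sketch, not part of the original patch] The SetValueGradKernel
// removed above reduces the gradient of a broadcast ValueTensor by summing the
// sliced out-grad over every offset that GetOffsets() enumerates. The same
// idea on a plain 2-D buffer, with hypothetical names:
#include <cstddef>
#include <vector>

// grad_out has shape dims; grad_value has shape value_dims where each
// value_dims[k] is either dims[k] or 1 (a broadcast axis). Every grad_out
// element is accumulated into the value element it was broadcast from.
inline void ReduceGradForBroadcast2D(const std::vector<float>& grad_out,
                                     const std::size_t dims[2],
                                     const std::size_t value_dims[2],
                                     std::vector<float>* grad_value) {
  grad_value->assign(value_dims[0] * value_dims[1], 0.f);
  for (std::size_t i = 0; i < dims[0]; ++i) {
    for (std::size_t j = 0; j < dims[1]; ++j) {
      std::size_t vi = (value_dims[0] == 1) ? 0 : i;
      std::size_t vj = (value_dims[1] == 1) ? 0 : j;
      (*grad_value)[vi * value_dims[1] + vj] += grad_out[i * dims[1] + j];
    }
  }
}
// E.g. dims = {2, 3} and value_dims = {1, 3}: each column of grad_out is summed
// over the two broadcast rows, which is what the slice-and-accumulate loop over
// GetOffsets() does for arbitrary rank (plus an optional reverse for negative
// steps).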
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -30,30 +35,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of SoftmaxOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of SoftmaxOp is not found.")); - - auto dim_x = ctx->GetInputDim("X"); - auto rank_x = dim_x.size(); - auto axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_GE(axis, -rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - PADDLE_ENFORCE_LT(axis, rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -168,23 +149,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::InvalidArgument("Input(Out) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument("Input(Out@GRAD) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Out"), - ctx->GetInputDim(framework::GradVarName("Out")), - platform::errors::InvalidArgument("Input(Out) and its gradients " - "should have a same shape.")); - - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -244,9 +208,14 @@ DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(softmax, SoftmaxInferShapeFunctor, + PD_INFER_META(phi::SoftmaxInferMeta)); REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, - ops::SoftmaxInplaceInferer); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); + ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, + SoftmaxGradInferShapeFunctor); diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bff8061814ae66f243ca9d863cf866821ede4a32..aa944cfcfbb1713aeb27b501083853abb4ffed40 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,10 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel { out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { - math::Pool2dFunctor, T> pool_forward; - math::MaxPool max_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::MaxPool, T> + pool_forward; + phi::funcs::MaxPool max_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, max_process); } else if (pooling_type == "avg") { - math::Pool2dFunctor, T> pool_forward; - math::AvgPool avg_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPool, T> + pool_forward; + phi::funcs::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, avg_process); @@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant< + typename framework::ConvertToPhiContext::TYPE, T> + zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = phi::stride(out->dims()); @@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel { outgrad_level.Resize(out_shape); // pooling backward if (pooling_type == "max") { - math::MaxPool2dGradFunctor pool2d_backward; + phi::funcs::MaxPool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, T> + pool2d_backward; pool2d_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, in_x_grad); } else if (pooling_type == "avg") { - math::Pool2dGradFunctor, T> + phi::funcs::Pool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPoolGrad, T> pool_backward; - math::AvgPoolGrad avg_process; + phi::funcs::AvgPoolGrad avg_process; pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, true, false, in_x_grad, avg_process); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 58e5440689926497705624a0c64e6cc3d43dbab1..a776a78616b8d6dbac66ccab0d59433b98ae65e4 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index df84659a00f4c4220853404a8b28c6ccc93623a3..35b925ca172b7ccb665978010dbcdd2cb10c9678 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" #include "paddle/phi/infermeta/binary.h" namespace paddle { diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h deleted file mode 100644 index fd46aca456cd9bd883cf9d1ce3576b307794b1a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/triangular_solve_op.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "glog/logging.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void triangular_solve(const DeviceContext &context, const Tensor &x, - const Tensor &y, Tensor *out, bool upper, - bool transpose, bool unitriangular) { - // Tensor broadcast use eigen library - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); - - Tensor x_bst(x.type()); - TensorExpand(context, x, &x_bst, x_bst_dims_vec); - - Tensor y_bst(y.type()); - TensorExpand(context, y, &y_bst, y_bst_dims_vec); - - // TriangularSolveFunctor performs calculations in-place - // x_clone should be a copy of 'x' after broadcast - // out should be a copy of 'y' after broadcast - Tensor x_clone(x.type()); - x_clone.Resize(phi::make_ddim(x_bst_dims_vec)); - x_clone.mutable_data(context.GetPlace()); - framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone); - - out->Resize(phi::make_ddim(y_bst_dims_vec)); - out->mutable_data(context.GetPlace()); - framework::TensorCopy(y_bst, context.GetPlace(), context, out); - - math::TriangularSolveFunctor functor; - functor(context, &x_clone, out, /*left=*/true, upper, transpose, - unitriangular); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h index a6ff2f686cb76bb03de8074014f82d6ff9e57bd3..8af6e281424eaabd8d6ea86843b3c13aa36cba47 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.h +++ b/paddle/fluid/operators/truncated_gaussian_random_op.h @@ -1,11 +1,8 @@ /* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -140,19 +137,9 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - T a_normal_cdf; - T b_normal_cdf; - TruncatedNormal(T mean, T std) : mean(mean), std(std) { - auto normal_cdf = [](T x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf(-2.0); - b_normal_cdf = normal_cdf(2.0); - } - + TruncatedNormal(T mean, T std) : mean(mean), std(std) {} T operator()(T value) const { - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; + return std::sqrt(2.0) * Erfinv(value) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 261d9cee2d5cd25c510aacb280b9623f985eb1f7..4ed0dd22ec086923bbe47af192cab8d001ae734f 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -84,8 +84,13 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { Tensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 803b61fbe813f85f48b71d1de7fc41eb26e4b8da..984d9f397cc655b4cfd7e0bc211db1665252272f 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -32,8 +32,13 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index 7f676cbb65ee460cdf639641330d49b5774f95a5..f6112fb59c12252255861825ff9d7b534c542665 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -16,7 +16,6 @@ limitations under the License. 
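// [Editor's sketch, not part of the original patch] The truncated-gaussian
// changes above stop sampling U in (0, 1) and remapping it through the
// [Phi(a), Phi(b)] window; instead they draw the erfinv argument directly from
// [2*Phi((-2 - mean)/std) - 1, 2*Phi((2 - mean)/std) - 1], so that
// sqrt(2) * Erfinv(value) * std + mean always lands in [-2, 2]. A standalone
// restatement; ErfinvApprox is a rough Winitzki approximation used only to
// keep the sketch self-contained:
#include <cmath>
#include <random>

inline float ErfinvApprox(float x) {  // valid for x in (-1, 1)
  const float a = 0.147f;
  float ln1mx2 = std::log(1.0f - x * x);
  float t = 2.0f / (3.14159265f * a) + ln1mx2 / 2.0f;
  float s = std::sqrt(std::sqrt(t * t - ln1mx2 / a) - t);
  return x < 0 ? -s : s;
}

inline float SampleTruncatedNormal(float mean, float std_dev, std::mt19937* rng) {
  auto normal_cdf = [](float x) {
    return (1.0f + std::erf(x / std::sqrt(2.0f))) / 2.0f;
  };
  float a_cdf = normal_cdf((-2.0f - mean) / std_dev);  // CDF at lower bound -2
  float b_cdf = normal_cdf((2.0f - mean) / std_dev);   // CDF at upper bound 2
  std::uniform_real_distribution<float> dist(2.0f * a_cdf - 1.0f,
                                             2.0f * b_cdf - 1.0f);
  float value = dist(*rng);  // already inside erfinv's domain (-1, 1)
  return std::sqrt(2.0f) * ErfinvApprox(value) * std_dev + mean;
}
// Since 2 * Phi(z) - 1 == erf(z / sqrt(2)), inverting recovers z, so the sample
// is exactly mean + std * z with z confined to the truncation window.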
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index c5dff84723ccf4f40065f5a1d13cf5cdce8b3a0f..ce9b09f60ca352a0cc33d2e477134ca2e10c2ad2 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -56,6 +56,9 @@ XPUOpMap& get_kp_ops() { {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"thresholded_relu", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 01de7349f4823a66b2d180f3d1493477f361273a..1254331835bbdf4dfc698021a52208d846651dd5 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -618,7 +618,7 @@ class BinaryMKLDNNHandler const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, float scale_x, float scale_y, float scale_z, - const dnnl::post_ops& post_ops = dnnl::post_ops()) + const dnnl::post_ops& post_ops = dnnl::post_ops{}) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, @@ -676,8 +676,8 @@ class BinaryMKLDNNHandler const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - attributes.set_post_ops(post_ops); + auto attributes = + CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); @@ -690,10 +690,9 @@ class BinaryMKLDNNHandler } private: - static inline dnnl::primitive_attr CreateAttributes(dnnl::algorithm op, - float scale_x, - float scale_y, - float scale_z) { + static inline dnnl::primitive_attr CreateAttributes( + dnnl::algorithm op, float scale_x, float scale_y, float scale_z, + dnnl::post_ops post_ops = dnnl::post_ops{}) { // Scales set in attributes for inputs contibute to the output equation // in the following way (assuming no broadcasting takes place): // output_i = scale_0 * x_i <+ or *> scale_1 * y_i; @@ -718,6 +717,7 @@ class BinaryMKLDNNHandler {scale_0}); attributes.set_scales(/* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0, {scale_1}); + if (post_ops.len() > 0) attributes.set_post_ops(post_ops); return attributes; } }; diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index feb72bce72bf8c9c13260d53d65020a68ba85eb8..940fc98d3b32021ae8b278305c54d8819292daaf 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -77,7 +77,9 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, #endif #endif if (FLAGS_enable_host_event_recorder_hook == false) { - OriginalConstruct(name, role, "none"); + if (g_state != ProfilerState::kDisabled) { // avoid temp string + OriginalConstruct(name, role, "none"); + } return; } if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { @@ -165,8 +167,8 @@ void RecordEvent::End() { } #endif #endif - 
uint64_t end_ns = PosixInNsec(); if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { + uint64_t end_ns = PosixInNsec(); if (LIKELY(shallow_copy_name_ != nullptr)) { HostEventRecorder::GetInstance().RecordEvent( shallow_copy_name_, start_ns_, end_ns, role_, type_); @@ -190,6 +192,7 @@ void RecordEvent::End() { // lock is not needed, the code below is thread-safe DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { + uint64_t end_ns = PosixInNsec(); tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 8ee22590b6d79b3ff6c22f26090f50c2e21d6275..7b223f7ed27e2249d84539a81312658f8c2260f0 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -44,6 +44,9 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) + if (WITH_GPU) + set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator) + endif() if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) @@ -348,7 +351,7 @@ if(WITH_PYTHON) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc - DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) + DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python custom_operator custom_operator_node) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) diff --git a/paddle/fluid/pybind/custom_handwrite_op_funcs.h b/paddle/fluid/pybind/custom_handwrite_op_funcs.h index 7a276df0d5bdc95b6c925b7c620d7931b6aaf0ec..3b898ce77ce6fb43ca9aaba38e5db9e01a1d19d3 100644 --- a/paddle/fluid/pybind/custom_handwrite_op_funcs.h +++ b/paddle/fluid/pybind/custom_handwrite_op_funcs.h @@ -31,7 +31,6 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, tstate = PyEval_SaveThread(); run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); - std::cout << "end run_program_dygraph_function" << std::endl; PyEval_RestoreThread(tstate); tstate = nullptr; } catch (...) 
{ diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 0b1796703817c28526172a542ae9253578f44ee2..1df917b8c3594d4505d9e92cd9a8c64bffd50279 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -51,6 +51,18 @@ namespace pybind { using Tensor = paddle::experimental::Tensor; +std::shared_ptr CreateEagerReducer( + py::handle py_tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return std::make_shared( + params, group_indices, is_sparse_gradient, process_group, + group_size_limits, find_unused_parameters); +} + #if defined(PADDLE_WITH_GLOO) using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo; using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore; @@ -271,6 +283,17 @@ void BindDistributed(py::module *m) { py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, py::arg("tensor_indices") = std::vector{}, py::call_guard()); + + py::class_>(*m, "EagerReducer", + R"DOC()DOC") + .def(py::init(&CreateEagerReducer)) + .def("prepare_for_backward", + [](distributed::EagerReducer &self, py::handle py_tensors) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + self.PrepareForBackward(params); + }, + py::arg("tensors"), py::call_guard()); } } // end namespace pybind diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 0b04dc7347ce78f87d6f8d81e30eb4135fd965ed..e110432c67d395c865d934a47eaa4a803053db8b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -21,21 +21,25 @@ limitations under the License. 
*/ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" - namespace paddle { namespace pybind { @@ -168,7 +172,276 @@ static PyObject* eager_api_read_next_tensor_list(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static void ConstructFwdAndBwdMap( + const std::vector& vec_map, + const std::string& op_type) { + auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap(); + if (in_out_map.find(op_type) != in_out_map.end()) { + VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> "; + return; + } else { + VLOG(7) << "Construct CustomEdgesSlotMap "; + auto inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[0]); + auto outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[0]); + auto attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[0]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]); + auto grad_inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]); + auto grad_attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); + std::vector> res(5); + in_out_map.insert({op_type, res}); + // Prepare pos map for grad_outputs + VLOG(7) << "Prepare pos map for grad_outputs"; + PADDLE_ENFORCE_LE( + grad_outputs_names.size(), inputs_names.size(), + paddle::platform::errors::InvalidArgument( + "Grad outputs num should be less equal than forward inputs num.")); + for (size_t i = 0; i < grad_outputs_names.size(); i++) { + size_t end = grad_outputs_names[i].find("@GRAD"); + PADDLE_ENFORCE_NE( + end, std::string::npos, + paddle::platform::errors::NotFound( + "All Grad outputs should be grad and we got %s is not grad var, " + "please check your op and change to fit the rule.", + grad_outputs_names[i])); + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end) == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " inputs: " << inputs_names[j] << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][0][j] = i; + } + } + } + // Prepare pos map for grad_inputs + for (size_t i = 0; i < grad_inputs_names.size(); i++) { + size_t end = grad_inputs_names[i].find("@GRAD"); + if (end != std::string::npos) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i].substr(0, end) == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " outputs: " << outputs_names[j] << " related to No." 
+ << i << " grad_inputs's grad: " << grad_inputs_names[i]; + in_out_map[op_type][1][j] = i; + } + } + } else { + if (std::find(outputs_names.begin(), outputs_names.end(), + grad_inputs_names[i]) != outputs_names.end()) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i] == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " outputs: " << outputs_names[j] << " related to No." + << i + << " grad_inputs fwd outputs: " << grad_inputs_names[i]; + in_out_map[op_type][2][j] = i; + } + } + } else { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_inputs_names[i] == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " inputs: " << inputs_names[j] << " related to No." + << i + << " grad_inputs fwd inputs: " << grad_inputs_names[i]; + in_out_map[op_type][3][j] = i; + } + } + } + } + } + + // Prepare pos map for grad attrs_ + for (size_t i = 0; i < grad_attrs_names.size(); i++) { + auto end = std::find(attrs_names.begin(), attrs_names.end(), + grad_attrs_names[i]); + PADDLE_ENFORCE_NE(end, attrs_names.end(), + paddle::platform::errors::NotFound( + "All Grad attrs should be one of forward attrs and " + "we got %s is not one of them, please check your " + "op and change to fit the rule.", + grad_attrs_names[i])); + for (size_t j = 0; j < attrs_names.size(); j++) { + if (grad_attrs_names[i] == attrs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " attrs: " << attrs_names[j] << " related to No." << i + << " grad_attrs: " << grad_attrs_names[i]; + in_out_map[op_type][4][j] = i; + } + } + } + } +} + +static std::vector CastAttrsToTragetType( + const std::vector& src, + const std::vector& attrs_names) { + std::vector res; + PADDLE_ENFORCE_EQ(src.size(), attrs_names.size(), + paddle::platform::errors::InvalidArgument( + "We Expected same size of attrs and attrs_name list, " + "if u got this error indicate your custom op setting " + "%s attrs, but you just give %s", + attrs_names.size(), src.size())); + for (size_t i = 0; i < src.size(); i++) { + size_t end = attrs_names[i].find(": "); + std::string type_name = + attrs_names[i].substr(end + 2, attrs_names.size() - end - 2); + if (type_name == "int") { + if (src[i].type() == typeid(bool)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int)) { + res.emplace_back(src[i]); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Your No. %s attrs should only can be bool or int32, other type is " + "forbidden for now but we got %s. Check your code first please", + i, src[i].type().name())); + } + } else if (type_name == "int64_t") { + if (src[i].type() == typeid(bool)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int64_t)) { + res.emplace_back(src[i]); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Your No. %s attrs should only can be bool or int32 or int64_t, " + "other type is forbidden for now but we got %s. 
Check your code " + "first please", + i, src[i].type().name())); + } + } else { + res.emplace_back(src[i]); + } + } + return res; +} + +static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + paddle::CustomOpKernelContext ctx = + CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0); + std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); + VLOG(7) << "Get things for python for Custom Op: " << op_type + << ", trace_backward is: " << trace_backward; + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + PADDLE_ENFORCE_NE(meta_info_map.find(op_type), meta_info_map.end(), + paddle::platform::errors::NotFound( + "Can't find %s in Eager OpMetaInfoMap which should be " + "created by LoadOpMetaInfoAndRegisterOp, please make " + "sure you registered your op first and try again. ", + op_type)); + VLOG(7) << "Run Kernel of Custom Op: " << op_type; + std::vector res_attrs = CastAttrsToTragetType( + ctx.Attrs(), paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[0])); + ctx.EmplaceBackAttrs(res_attrs); + const auto& vec_map = meta_info_map.at(op_type); + (*paddle::framework::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); + + VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; + std::vector> ins_auto_grad_metas; + std::vector> outs_auto_grad_metas; + VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); + ins_auto_grad_metas.resize(ctx.InputRange().size()); + VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); + outs_auto_grad_metas.resize(ctx.OutputRange().size()); + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + ins_auto_grad_metas[i] = + egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( + ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); + } + for (size_t i = 0; i < ctx.OutputRange().size(); i++) { + outs_auto_grad_metas[i] = + egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + bool require_any_grad = false; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + require_any_grad = + require_any_grad || egr::EagerUtils::ComputeRequireGrad( + trace_backward, &(ins_auto_grad_metas[i])); + } + if (require_any_grad) { + VLOG(6) << " Construct Grad for Custom Op: " << op_type; + ConstructFwdAndBwdMap(vec_map, op_type); + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + } + auto grad_node = std::make_shared( + outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); + auto slot_map = + egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); + // Prepare Grad outputs + size_t no_grad_cnt = 0; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + if (slot_map[0].find(i) != slot_map[0].end()) { + grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], slot_map[0][i]); + grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); + } else { + grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + grad_node->AddEdges(&ins_auto_grad_metas[i], + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + no_grad_cnt++; + } + } + // Prepare Grad inputs with grad of fwd outputs + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); + 
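// [Editor's sketch, not part of the original patch] ConstructFwdAndBwdMap
// above pairs the grad op's "<name>@GRAD" slots with the forward op's slots by
// stripping the suffix and comparing names. The core matching step in
// isolation (the function name and map type here are hypothetical):
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

inline std::unordered_map<std::size_t, std::size_t> MapGradOutputsToFwdInputs(
    const std::vector<std::string>& grad_outputs_names,
    const std::vector<std::string>& inputs_names) {
  std::unordered_map<std::size_t, std::size_t> fwd_input_to_grad_output;
  for (std::size_t i = 0; i < grad_outputs_names.size(); ++i) {
    std::size_t end = grad_outputs_names[i].find("@GRAD");
    if (end == std::string::npos) continue;  // the real code raises an error here
    for (std::size_t j = 0; j < inputs_names.size(); ++j) {
      if (grad_outputs_names[i].substr(0, end) == inputs_names[j]) {
        fwd_input_to_grad_output[j] = i;  // forward input j feeds grad output i
      }
    }
  }
  return fwd_input_to_grad_output;
}
// E.g. inputs {"X", "Y"} and grad outputs {"Y@GRAD", "X@GRAD"} give
// {0 -> 1, 1 -> 0}; the remaining slot maps are built the same way for grad
// inputs that are grads of forward outputs, forward outputs, or forward inputs.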
egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + grad_node->SetGradInMeta(&(outs_auto_grad_metas[i]), i); + egr::EagerUtils::CheckAndRetainGrad(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + + // Prepare Grad inputs with fwd outputs + for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) { + VLOG(7) << "Prepare fwd_outs: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_outs[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, + ctx.OutputRangeAt(it->first).second)); + } + + // Prepare Grad inputs with fwd inputs + for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) { + VLOG(7) << "Prepare fwd_ins: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_ins[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.InputsBetween(ctx.InputRangeAt(it->first).first, + ctx.InputRangeAt(it->first).second)); + } + + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[1]); + std::vector attrs(attrs_names.size()); + // Prepare attrs for Grad node + for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) { + VLOG(7) << "Prepare fwd attrs: " << it->first + << " to grad_attrs: " << it->second; + attrs[it->second] = res_attrs[it->first]; + } + grad_node->SetAttrs(attrs); + } + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_functions[] = { + // TODO(jiabin): Remove scale when we have final state tests {"scale", (PyCFunction)(void (*)(void))eager_api_scale, METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_expected_place", @@ -179,6 +452,8 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, + METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, METH_VARARGS | METH_KEYWORDS, NULL}, {"read_next_tensor_list", diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index e5f22338dc61543a377d4a94307f834b774257d4..082ec382c79cd9c98ac75db14bc552883088b885 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -214,8 +214,8 @@ static PyObject* tensor_method__is_initialized(TensorObject* self, static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); - auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = self->tensor.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); @@ -688,6 +688,21 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* set_grad_type(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensor = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad(); + if (var_type == framework::proto::VarType::LOD_TENSOR) { + 
grad_tensor.set_impl(std::make_shared()); + } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { + grad_tensor.set_impl(std::make_shared()); + } + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -734,6 +749,8 @@ PyMethodDef variable_methods[] = { {"_register_backward_hook", (PyCFunction)(void (*)(void))tensor_register_reduce_hook, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2e1390cb96155c4832a8ceace889e331039ed43f..2572866b8f5198b2414163d4198e06b54d11fedc 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -72,7 +72,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY VLOG(6) << "Get grad for tensor: " << self->tensor.name(); auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); - if (meta) { + if (meta && meta->Grad().initialized()) { return ToPyObject(meta->Grad()); } else { Py_INCREF(Py_None); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index f4e148cf8dceb5211c368fa00211b2c7b9f0a725..217edad0c0a105cc649c6c8c4433b0c8eab0119b 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -27,10 +27,10 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" - namespace paddle { namespace pybind { @@ -46,6 +46,7 @@ extern PyTypeObject* g_npuplace_pytype; extern PyTypeObject* g_cudapinnedplace_pytype; extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype; +extern PyTypeObject* g_custom_op_kernel_ctx_pytype; int TensorDtype2NumpyDtype(phi::DataType dtype) { switch (dtype) { @@ -184,7 +185,7 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerVariable, but got %s", + "Tensor, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -319,7 +320,7 @@ framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerVariable, but got %s", + "DenseTensor, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -391,6 +392,19 @@ paddle::framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, return dtype; } +paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, + ssize_t arg_pos) { + if (PyObject_IsInstance( + obj, reinterpret_cast(g_custom_op_kernel_ctx_pytype))) { + return ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), " + "but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } +} PyObject* ToPyObject(bool value) { 
if (value) { Py_INCREF(Py_True); @@ -928,6 +942,5 @@ paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos); return framework::TransToPhiDataType(type); } - } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 966a920377b38f160a1d4789ef4b04b61d47f2c1..2187555e1c3c7f64bd864e4212bfc6ebe1fb1684 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -20,10 +20,10 @@ limitations under the License. */ #include "pybind11/pybind11.h" #include "pybind11/stl.h" namespace paddle { +class CustomOpKernelContext; namespace framework { class Scope; } - namespace pybind { typedef struct { @@ -40,6 +40,8 @@ int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); +paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, + ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); std::shared_ptr CastPyArg2VarBase(PyObject* obj, ssize_t arg_pos); @@ -52,6 +54,7 @@ std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); + PyObject* ToPyObject(int value); PyObject* ToPyObject(bool value); PyObject* ToPyObject(int64_t value); @@ -138,6 +141,7 @@ std::vector GetTensorPtrListFromArgs( ssize_t arg_idx, bool dispensable = false); // end of Slice related methods + std::vector GetScopePtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable); diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 362a3e44fab6254bef591bfd144e071821846271..4f25a6f1a5ca8d1a7926d148830934370e323e0f 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/exception.h" - +#include "paddle/phi/api/ext/exception.h" namespace paddle { namespace pybind { @@ -122,6 +122,8 @@ void ThrowExceptionToPython(std::exception_ptr p) { PyErr_SetString(EnforceNotMetException, e.what()); break; } + } catch (const paddle::PD_Exception& e) { + PyErr_SetString(PyExc_OSError, e.what()); } } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1c5b30fe087f3636a6a10579651d2c6a77a42343..21bbc7f3e369bf66935487d3f3619c9a0890399b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -64,6 +64,9 @@ limitations under the License. */ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" @@ -161,6 +164,9 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/fleet_py.h" #endif +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "pybind11/stl.h" DECLARE_bool(use_mkldnn); @@ -184,6 +190,7 @@ PyTypeObject *g_cudapinnedplace_pytype = nullptr; PyTypeObject *g_mluplace_pytype = nullptr; PyTypeObject *g_framework_tensor_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; +PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) @@ -729,6 +736,13 @@ PYBIND11_MODULE(core_noavx, m) { lib[string]: the libarary, could be 'phi', 'fluid' and 'all'. )DOC"); + // NOTE(Aganlengzi): KernelFactory static instance is initialized BEFORE + // plugins are loaded for custom kernels, but de-initialized AFTER they are + // unloaded. We need manually clear symbols(may contain plugins' symbols) + // stored in this static instance to avoid illegal memory access. + m.def("clear_kernel_factory", + []() { phi::KernelFactory::Instance().kernels().clear(); }); + // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. So we add this API // to enable eager deletion mode in unittest. @@ -747,6 +761,57 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); + py::class_ custom_op_kernel_ctx( + m, "CustomOpKernelContext", R"DOC()DOC"); + g_custom_op_kernel_ctx_pytype = + reinterpret_cast(custom_op_kernel_ctx.ptr()); + custom_op_kernel_ctx.def(py::init<>()) + .def("add_inputs", + [](paddle::CustomOpKernelContext &self, const py::handle &input) { + PyObject *obj = input.ptr(); + if (PyList_Check(obj) || PyTuple_Check(obj)) { + self.EmplaceBackInputs( + std::move(CastPyArg2VectorOfTensor(obj, 1))); + } else { + self.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, 1))); + } + }) + .def("add_outputs", + [](paddle::CustomOpKernelContext &self, py::handle &outputs) { + PyObject *obj = outputs.ptr(); + if (PyList_Check(obj) || PyTuple_Check(obj)) { + self.EmplaceBackOutputs( + std::move(CastPyArg2VectorOfTensor(obj, 1))); + } else { + self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1))); + } + }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + bool attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + int attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + float attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + int64_t attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, const std::string &attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { + self.EmplaceBackAttr(attr); + }); + py::class_ framework_tensor(m, "Tensor", py::buffer_protocol()); g_framework_tensor_pytype = @@ -1180,6 +1245,287 @@ PYBIND11_MODULE(core_noavx, m) { }); #else }) +#ifdef PADDLE_WITH_CUDA 
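The pybind.cc block above exposes CustomOpKernelContext to Python, merges loaded custom-op meta info into the eager controller, and adds clear_kernel_factory so that kernels registered by plugins can be dropped before the plugins are unloaded. A hedged sketch of how a test might drive these bindings directly; the library name is hypothetical, and in practice the generated custom-op wrappers, not user code, assemble the context:

.. code-block:: python

    import paddle
    from paddle.fluid import core

    # Hypothetical path to a compiled custom-op / custom-kernel library.
    core.load_op_meta_info_and_register_op("librelu_op.so")

    # Normally the generated Python wrapper builds this context.
    ctx = core.CustomOpKernelContext()
    ctx.add_inputs(paddle.randn([4, 16]))     # a single Tensor or a list/tuple of Tensors
    ctx.add_outputs([paddle.empty([4, 16])])
    ctx.add_attr(0.5)                         # dispatched to the float overload
    ctx.add_attr([1, 2, 3])                   # dispatched to the std::vector<int> overload

    # Before plugins are unloaded, drop kernels that may reference their symbols.
    core.clear_kernel_factory()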
+ .def("_share_buffer_with", + [](framework::Tensor &self, const framework::Tensor src, + py::tuple t) { + auto *cuda_ipc_allocation = + dynamic_cast( + src.Holder().get()); + + PADDLE_ENFORCE_NOT_NULL( + cuda_ipc_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not Cuda IPC shared tensor. " + "Now only Tensor shared by cuda ipc could use this " + "api.")); + + size_t size = t[0].cast(); + auto dtype = + static_cast(t[1].cast()); + auto dims = phi::make_ddim(t[2].cast>()); + auto lod_info = t[3].cast(); + auto device_id = t[4].cast(); + + auto shared_reader_holder = + std::make_shared( + cuda_ipc_allocation->ptr(), + cuda_ipc_allocation->base_ptr(), size, + platform::CUDAPlace(device_id)); + + self.ResetHolderWithType(shared_reader_holder, dtype); + self.Resize(dims); + self.set_lod(lod_info); + + VLOG(6) << "Reconstructed tensor with buffer shared!"; + }, + R"DOC( + Deserialize GPU Tensor for existed shared Cuda IPC tensor. + + Params: + tensor: Shared Cuda IPC tensor. + tuple: contrains data size, data type, + tensor dims, lod information, device index. + + )DOC") + .def("_share_cuda", + [](framework::Tensor self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass " + "to shared memory. "); + + auto *holder = dynamic_cast( + self.Holder().get()); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(holder->place()), true, + platform::errors::InvalidArgument( + "Tensor is not on GPU. share_cuda only support GPU " + "Tensor, share_filename is for CPU tensor.")); + + void *base_ptr = holder->base_ptr(); + ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - + reinterpret_cast(base_ptr); + + cudaIpcMemHandle_t handle; + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); + + auto _handle = py::bytes(reinterpret_cast(&handle), + (py::ssize_t)CUDA_IPC_HANDLE_SIZE); + + // TODO(ZHUI): use cuda event, to avoid sync. + const auto &device_id = paddle::platform::GetCurrentDeviceId(); + auto stream = + paddle::platform::stream::get_current_stream(device_id); + stream->Synchronize(); + + int type_idx = static_cast(self.type()); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, + type_idx, vectorize(self.dims()), self.lod(), + device_id); + }, + R"DOC( + Serialize GPU Tensor by cudaIpcMemHandle. + + Returns: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + + )DOC") + .def("_new_shared_cuda", + [](py::tuple t) { + if (t.size() != 7) + throw std::runtime_error( + "Invalid Tensor meta info for shared cuda tensor!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation from handle + const std::string &handle = t[0].cast(); + ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); + auto device_id = t[6].cast(); + auto base_ptr = memory::allocation::GetIpcBasePtr(handle); + size_t size = t[2].cast(); + void *dev = base_ptr.get(); + dev = reinterpret_cast(dev) + offset_bytes; + + auto shared_reader_holder = + std::make_shared( + dev, size, device_id, std::move(base_ptr)); + + // 3. 
Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[3].cast())); + tensor.Resize(phi::make_ddim(t[4].cast>())); + tensor.set_lod(t[5].cast()); + + return tensor; + }, + R"DOC( + Deserialize GPU lod tensor from cudaIpcMemHandle. + + Params: + tuple: contains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) + + )DOC") +#endif + .def("_share_filename", + [](framework::Tensor &self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0, cannot be passed to " + "shared memory. "); + + auto holder = self.Holder(); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(holder->place()) || + platform::is_cuda_pinned_place(holder->place()), + true, platform::errors::InvalidArgument( + "Tensor is not on CPU. share_filename only " + "supports CPU Tensor.")); + + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + holder.get()); + // If the tensor is not shared, allocate memory map allocation. + if (mmap_allocation == nullptr) { + void *data_ptr = self.data(); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_EXCLUSIVE; + std::string handle = memory::allocation::GetIPCName(); + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + handle, flags, data_size); + + // copy data & reset holder + if (platform::is_cuda_pinned_place(holder->place())) { +#ifdef PADDLE_WITH_CUDA + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CUDAPinnedPlace(), data_ptr, data_size); +#endif + } else { + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + } + self.ResetHolder(shared_holder); + mmap_allocation = shared_holder.get(); + } + int type_idx = static_cast(self.type()); + + return py::make_tuple(mmap_allocation->ipc_name(), + mmap_allocation->size(), type_idx, + vectorize(self.dims()), self.lod()); + }, + R"DOC( + Serialize CPU lod tensor in shared memory to tuple. + If the tensor is not in shared memory, we will copy it first. + + Returns: + tuple: contains ipc name, data size, data type, + tensor dims and lod information. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + + )DOC") + .def("_new_shared_filename", + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor meta info state!"); + + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_NOCREATE; + + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + ipc_name, flags, size); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + }, + R"DOC( + Deserialize CPU lod tensor from shared memory.
+ + Params: + tuple: contains ipc file name, data size, data type, + tensor dims and lod information. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) + + )DOC") + .def("_shared_incref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->incref(); + } + }, + R"DOC( + Increase reference count of share_filename tensor. + )DOC") + .def("_shared_decref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->decref(); + } + }, + R"DOC( + Decrease reference count of share_filename tensor. + )DOC") .def(py::pickle( [](const framework::Tensor &t) { // __getstate__ auto holder = t.Holder(); @@ -2536,10 +2882,11 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); - m.def("load_op_meta_info_and_register_op", - framework::LoadOpMetaInfoAndRegisterOp); + m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) { + egr::Controller::Instance().MergeOpMetaInfoMap( + framework::LoadOpMetaInfoAndRegisterOp(dso_name)); + }); m.def("init_devices", []() { framework::InitDevices(); }); - m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index a037fa13eb53b94fd8d82413dad55d7f34b0006d..add332abd30eaaad1772a0b8e326ea0ae6c27e8b 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -188,16 +188,14 @@ static void ParseIndexingSlice( int start = static_cast(PyLong_AsLong(slice_item)); auto s_t = start; start = start < 0 ?
start + dim_len : start; - if (start >= dim_len || start < 0) { - std::string str_error_message = - "The starting index " + std::to_string(s_t) + - " of slice is out of bounds in tensor " + std::to_string(dim) + - "-th axis, it shound be in the range of [" + - std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")"; - // py::index_error is corresponding to IndexError in Python - // Used to indicate out of bounds access in __getitem__, __setitem__ - throw py::index_error(str_error_message); - } + + PADDLE_ENFORCE( + 0 <= start && start < dim_len, + platform::errors::OutOfRange("The starting index %d of slice is out " + "of bounds in tensor %d-th axis, it " + "should be in the range of [%d, %d).", + s_t, dim, -dim_len, dim_len)); + + slice_axes->push_back(dim); slice_starts->push_back(start); slice_ends->push_back(start + 1); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c593c7df3e0ec708beecfd6c5051637d65a7f79d..6849fcb039410f95d829b9bb793a856f1485bd6c 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -585,14 +585,20 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj, auto &step = *pstep; auto &slicelength = *pslicelength; const framework::DDim &srcDDim = self.dims(); - if (dim < 0 || dim >= srcDDim.size()) { - throw py::index_error(); - } + PADDLE_ENFORCE( + 0 <= dim && dim < srcDDim.size(), + platform::errors::OutOfRange("The dim %d of slice is out of bounds, it " + "should be in the range of [0, %d).", + dim, srcDDim.size())); + if (py::isinstance(obj)) { size_t lstart, lstop, lstep, lslicelength; py::slice s = static_cast(obj); if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) { - throw py::index_error(); + PADDLE_THROW(platform::errors::OutOfRange( + "Slice on dim: %d is invalid, please check the validity of tensor " + "dims or slice item.", + dim)); } start = static_cast(lstart); stop = static_cast(lstop); @@ -600,15 +606,19 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj, slicelength = static_cast(lslicelength); } else if (py::isinstance(obj)) { start = static_cast(static_cast(obj)); - if (std::abs(start) >= srcDDim[dim]) { - throw py::index_error(); - } + PADDLE_ENFORCE( + std::abs(start) < srcDDim[dim], + platform::errors::OutOfRange("The start %d of slice is out of bounds, " + "it should be in the range of (%d, %d).", + start, -srcDDim[dim], srcDDim[dim])); start = (start >= 0) ?
start : srcDDim[dim] - start; stop = start + 1; step = 1; slicelength = 1; } else { - throw py::index_error(); + PADDLE_THROW( + platform::errors::OutOfRange("Index object error, the index object for " + "slice only supports slice(::) and int.")); } } diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index ed29b5b44c7791d356ec1283a0027cacf1fd5e7a..4e273f6d551edd74ec979e6ec34aedabdb58bd10 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -90,7 +90,6 @@ add_subdirectory(tests) set(infrt_mlir_incs basic_kernels_inc test_kernels_inc - infrt_base_inc tensor_shape_inc dense_tensor_inc pd_ops_inc diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index e0488117783d5657aa97c301d9d12ce1c77017e7..0500a8123044cd05695c5167b1afaa48a6027b57 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -24,7 +24,7 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/host_context/core_runtime.h" #include "paddle/infrt/host_context/kernel_registry.h" @@ -144,7 +144,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { // process results auto& last_op = predict_func.front().back(); - if (last_op.getName().getStringRef() == "Infrt.return") { + if (last_op.getName().getStringRef() == "infrt.return") { for (size_t i = 0; i < last_op.getNumOperands(); ++i) { auto* value = AddValue(mlir::Value(last_op.getOperand(i))); results_.push_back(ValueRef(value)); diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index e35989da2085b21f4dbfaadea05793fc9dcb8753..a3f2d0afafc417cc7a4cbba8a3d6bfa92c9bef00 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -2,26 +2,20 @@ core_gather_headers() gather_srcs(infrt_src SRCS dialect.cc - basic_kernels.cc - test_kernels.cc - infrt_base.cc - init_infrt_dialects.cc + init_dialects.cc tensor_shape.cc dense_tensor.cc mlir_loader.cc diagnostic_utils.cc - pd_types.cc pd_ops.cc ) -mlir_tablegen_on(basic_kernels) -mlir_tablegen_on(test_kernels) -mlir_tablegen_on(infrt_base DIALECT Infrt) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) mlir_tablegen_on(pd_op_base DIALECT pd) mlir_tablegen_on(pd_ops) mlir_tablegen_on(pd_extra_ops) + mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 27febffe8156379c63a0b6b3fb048f7441255f0e..7fbd1e8a4efe1e9dc1d022beb7673ee8a59c7e36 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,7 +19,7 @@ #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index f5db90648eec9933eadf897a8090260bdbfe575b..666c7b300af33db0c27e5b3ab8a74aa4b1591c9b 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -2,7 +2,7 @@ #else #define DT_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/tensor_shape_base.td" include 
"mlir/Interfaces/SideEffectInterfaces.td" diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index 08ce2d4707bfdc8498610793437675ae8238475e..5f65336453fbdf82f30948aeea8dc52b0367159b 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -1,17 +1,3 @@ -core_gather_headers() - -gather_srcs(infrt_src SRCS - common_type.cc - infrt_dialect.cc - ) - - -add_mlir_dialect(infrt_ops infrt) - -set(LLVM_TARGET_DEFINITIONS infrt_ops.td) -mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) -mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) -add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) -add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) - +add_subdirectory(common) +add_subdirectory(ir) add_subdirectory(pass) diff --git a/paddle/infrt/dialect/infrt/common/CMakeLists.txt b/paddle/infrt/dialect/infrt/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f693c82b5060ef35eecbc1ef9ad5053d6b93e4ad --- /dev/null +++ b/paddle/infrt/dialect/infrt/common/CMakeLists.txt @@ -0,0 +1,6 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + types.cc + utils.cc + ) diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common/types.cc similarity index 97% rename from paddle/infrt/dialect/infrt/common_type.cc rename to paddle/infrt/dialect/infrt/common/types.cc index 00684c505268c09e97d262a3526c946d1bc3095c..62419a196288bb052a9f240ecc25f34c102a5b35 100644 --- a/paddle/infrt/dialect/infrt/common_type.cc +++ b/paddle/infrt/dialect/infrt/common/types.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common/types.h similarity index 100% rename from paddle/infrt/dialect/infrt/common_type.h rename to paddle/infrt/dialect/infrt/common/types.h diff --git a/paddle/infrt/dialect/infrt/common/utils.cc b/paddle/infrt/dialect/infrt/common/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ffb23c490f8f52044d35d20508f42f3f9a89413 --- /dev/null +++ b/paddle/infrt/dialect/infrt/common/utils.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/dialect/infrt/common/utils.h" + +mlir::SmallVector infrt::cvtValueToValueRange( + const mlir::Value &operand) { + return mlir::SmallVector(1, operand); +} + +mlir::SmallVector infrt::concatTwoValueRange( + mlir::ValueRange operand_0, mlir::ValueRange operand_1) { + mlir::SmallVector operands; + operands.append(operand_0.begin(), operand_0.end()); + operands.append(operand_1.begin(), operand_1.end()); + return operands; +} diff --git a/paddle/fluid/operators/searchsorted_op.cu b/paddle/infrt/dialect/infrt/common/utils.h similarity index 57% rename from paddle/fluid/operators/searchsorted_op.cu rename to paddle/infrt/dialect/infrt/common/utils.h index 4633ab43efba121cf4c55a877d90b974690952ec..886407b56649a296046d570826cf2b1b0e8aade8 100644 --- a/paddle/fluid/operators/searchsorted_op.cu +++ b/paddle/infrt/dialect/infrt/common/utils.h @@ -12,12 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/searchsorted_op.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; +#pragma once -REGISTER_OP_CUDA_KERNEL( - searchsorted, ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); +#include +#include +#include +#include +#include +#include + +namespace infrt { + +mlir::SmallVector cvtValueToValueRange( + const mlir::Value &operand); + +mlir::SmallVector concatTwoValueRange( + mlir::ValueRange operand_0, mlir::ValueRange operand_1); +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c009bdb267e6ea1dd5a5fb392f64dddb7a05f06 --- /dev/null +++ b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt @@ -0,0 +1,18 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_dialect.cc + basic_kernels.cc + test_kernels.cc + ) + +add_mlir_dialect(infrt_ops infrt) + +set(LLVM_TARGET_DEFINITIONS infrt_ops.td) +mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) +mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) +add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) +add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) + +mlir_tablegen_on(basic_kernels) +mlir_tablegen_on(test_kernels) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc similarity index 63% rename from paddle/infrt/dialect/basic_kernels.cc rename to paddle/infrt/dialect/infrt/ir/basic_kernels.cc index c1aa75fb24650b99ea8371c0ecbe7e572df2f0ce..ba83f3e36c94a173accad9fb6e746eaec0ec8e6c 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include #include @@ -30,23 +30,6 @@ namespace infrt { namespace dialect { using namespace mlir; // NOLINT -static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SymbolRefAttr callee_attr; - FunctionType callee_type; - SmallVector operands; - auto callee_loc = parser.getNameLoc(); - if (parser.parseAttribute(callee_attr, "callee", result.attributes) || - parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(callee_type) || - parser.addTypesToList(callee_type.getResults(), result.types) || - parser.resolveOperands( - operands, callee_type.getInputs(), callee_loc, result.operands)) - return failure(); - return success(); -} - static ParseResult parseConstantOp(Type attrType, OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT @@ -79,24 +62,6 @@ static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT IntegerType::get(result.getContext(), 64), parser, result); } -static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SmallVector opInfo; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(opInfo) || - (!opInfo.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(opInfo, types, loc, result.operands)); -} - -static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << op->getAttr("callee") << "("; - p.printOperands(op.getOperands()); - p << ")"; - p.printOptionalAttrDict(op->getAttrs(), {"callee"}); - p << " : "; -} - static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT p << " "; p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); @@ -127,37 +92,13 @@ static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT printConstant(p, op); } -static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT - if (op.getNumOperands() > 0) { - p << ' '; - p.printOperands(op.getOperands()); - p << " : "; - llvm::interleaveComma(op.getOperands(), p); - } -} - -static LogicalResult verify(CallOp op) { return success(); } - static LogicalResult verify(ConstantF32Op op) { return success(); } static LogicalResult verify(ConstantI32Op op) { return success(); } static LogicalResult verify(ConstantF64Op op) { return success(); } static LogicalResult verify(ConstantI64Op op) { return success(); } -static LogicalResult verify(ReturnOp op) { - auto function = dyn_cast(op->getParentOp()); - - if (!function) return success(); - - auto results = function.getType().getResults(); - if (op.getNumOperands() != results.size()) - return op.emitOpError("has ") - << op.getNumOperands() - << " operands, but enclosing function returns " << results.size(); - - return success(); -} } // namespace dialect } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/basic_kernels.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc" diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/infrt/ir/basic_kernels.h similarity index 92% rename from paddle/infrt/dialect/basic_kernels.h rename to paddle/infrt/dialect/infrt/ir/basic_kernels.h index b82abcd52d28f45b18824d9ea6f9e12c2ec1c574..a36f55691b716dda51120e8c4be7c956df9b9f25 100644 --- a/paddle/infrt/dialect/basic_kernels.h +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.h @@ -18,4 +18,4 @@ #include 
#define GET_OP_CLASSES -#include "paddle/infrt/dialect/basic_kernels.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.hpp.inc" diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/infrt/ir/basic_kernels.td similarity index 69% rename from paddle/infrt/dialect/basic_kernels.td rename to paddle/infrt/dialect/infrt/ir/basic_kernels.td index 89d8cd65b85cd39c9eb50edca1aa1bfaf47073a4..60315b45dd0dfaee8437c1dd312691445fdede56 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.td @@ -4,10 +4,10 @@ #else #define BASIC_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -class INFRT_Op traits = []> : Op { +class INFRT_Op traits = []> : Op { // Each registered op needs to provide all of a printer, parser and verifier. let printer = [{ return infrt::dialect::print(p, *this); }]; @@ -15,23 +15,6 @@ class INFRT_Op traits = []> : Op { - let summary = "call a host operation"; - let description = [{ - The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. - - %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 - }]; - - let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); - let results = (outs Variadic); - - let extraClassDeclaration = [{ - mlir::StringRef getCallee() { return callee(); } - mlir::FunctionType getCalleeType(); - }]; -} - class ConstantOp : INFRT_Op<"constant." # suffix, [NoSideEffect]> { let summary = "constant value constructor in host"; @@ -45,22 +28,6 @@ def ConstantI64Op : ConstantOp<"i64", I64, I64Attr>; def ConstantF32Op : ConstantOp<"f32", F32, F32Attr>; def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; -def ReturnOp : INFRT_Op<"return", [Terminator]> { - let summary = "host executor return operation"; - let description = [{ - The "Infrt.return" operation represents a return operation within a function. - - func @foo() : (i32, f8) { - Infrt.return %0, %1 : i32, f8 - } - }]; - - let arguments = (ins Variadic:$operands); - - let builders = [OpBuilder<(ins), - [{ build($_builder, $_state, llvm::None); }]>]; -} - class AddOp : INFRT_Op<"add." # suffix, [NoSideEffect]> { let summary = "infrt.add operation"; let description = [{ @@ -112,7 +79,7 @@ def PrintF32Op : PrintOp<"f32", F32>; def PrintF64Op : PrintOp<"f64", F64>; def PrintStringOp : INFRT_Op<"print_string"> { - let summary = "Infrt.print_string"; + let summary = "infrt.print_string"; let description = [{ An operation that prints a string. }]; diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td similarity index 85% rename from paddle/infrt/dialect/infrt/infrt_ops_base.td rename to paddle/infrt/dialect/infrt/ir/infrt_base.td index 3190c1c84b8c04ceb7e91d829865c65503f5d708..c5130e89bb13a58a0aa0cf3aeae1b00e269eb259 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -101,4 +101,21 @@ class Infrt_Attr traits = [], : AttrDef { let mnemonic = ?; } + +// tools function. 
used for pattern rewriter +class INFRT_createI32Attr : NativeCodeCall< + "$_builder.getI32IntegerAttr(" # value # ")">; + +class INFRT_createSI32Attr : NativeCodeCall< + "$_builder.getSI32IntegerAttr(" # value # ")">; + +class INFRT_createF32Attr : NativeCodeCall< + "$_builder.getF32FloatAttr(" # value # ")">; + +def INFRT_cvtValueToValueRange : NativeCodeCall< + "infrt::cvtValueToValueRange($0)">; + +def INFRT_concatTwoValueRange : NativeCodeCall< + "infrt::concatTwoValueRange($0, $1)">; + #endif // INFRT_OPS_BASE diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc similarity index 84% rename from paddle/infrt/dialect/infrt/infrt_dialect.cc rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 400e4921c944491e0ce8cded38fec9435f4ad0bd..42de08ebc41938c40675435d4af10f758c52052b 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -12,40 +12,52 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include #include #include #include #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_opsDialect.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.cpp.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc" #define GET_ATTRDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" + +#include "paddle/infrt/dialect/infrt/ir/test_kernels.h" namespace infrt { void InfrtDialect::initialize() { addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc" // NOLINT >(); addAttributes< #define GET_ATTRDEF_LIST -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc" // NOLINT >(); addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc" // NOLINT + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc" + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" >(); } @@ -128,7 +140,7 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { void InfrtDialect::printType(::mlir::Type type, ::mlir::DialectAsmPrinter &os) const { - // print LoDTensorType, for example: !Infrt.lod_tensor<3x64x3x3xf32,5> + // print LoDTensorType, for example: !infrt.lod_tensor<3x64x3x3xf32,5> if (type.isa()) { auto lod_tensor_type = type.cast(); os << "lod_tensor<"; diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h similarity index 77% rename from paddle/infrt/dialect/infrt/infrt_dialect.h rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.h index 
ed5b36e556149dbc3026e732cf953c5562841921..3e6ea2a74c79d43015a62f166928e10adb48698a 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h @@ -22,14 +22,14 @@ #include #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" -#include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.h.inc" #define GET_ATTRDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.h.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td similarity index 64% rename from paddle/infrt/dialect/infrt/infrt_ops.td rename to paddle/infrt/dialect/infrt/ir/infrt_ops.td index 16ade66d47b8ee538a6e7c4f19bf571a25c3e416..f5430b03d0d75cfa8ba91f03ebc90ee0f73c25d7 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td @@ -1,4 +1,4 @@ -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" // Op definition class Infrt_Op traits = []> : Op { @@ -33,6 +33,26 @@ def Infrt_ReturnOp : Infrt_Op<"return", [Terminator]> { let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; } +def Infrt_CallOp : Infrt_Op<"call"> { + let summary = "call a host operation"; + let description = [{ + The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. + + %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); + + //let extraClassDeclaration = [{ + // mlir::StringRef getCallee() { return callee(); } + // mlir::FunctionType getCalleeType(); + // }]; + let assemblyFormat = [{ + $callee `(` $operands `)` attr-dict `:` functional-type($operands, results) + }]; +} + def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { let summary = "convert tensor type op"; let description = [{convert tensor type op!}]; diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/infrt/ir/test_kernels.cc similarity index 96% rename from paddle/infrt/dialect/test_kernels.cc rename to paddle/infrt/dialect/infrt/ir/test_kernels.cc index f0c4723b49a7906cf5327771e26eb87e8b1248c0..5f7f83a9dfa8011b3043e20da7d9f21f3afe5cf6 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/test_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.h" #include #include @@ -147,7 +147,7 @@ static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. 
auto ®ion = op.region(); auto &last_op = region.front().back(); - if (last_op.getName().getStringRef() != "Infrt.return") { + if (last_op.getName().getStringRef() != "infrt.return") { return op.emitOpError("missing return statement"); } if (last_op.getNumOperands() != 1) { @@ -161,4 +161,4 @@ static mlir::LogicalResult verify(BenchmarkOp op) { } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/test_kernels.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/infrt/ir/test_kernels.h similarity index 92% rename from paddle/infrt/dialect/test_kernels.h rename to paddle/infrt/dialect/infrt/ir/test_kernels.h index 73c8a6fb387bca6ebc7ae393e4bba32ab94aa951..1fe5020b240046f71571e3a4c999b1eae07741a1 100644 --- a/paddle/infrt/dialect/test_kernels.h +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.h @@ -17,4 +17,4 @@ #include #define GET_OP_CLASSES -#include "paddle/infrt/dialect/test_kernels.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.hpp.inc" diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/infrt/ir/test_kernels.td similarity index 93% rename from paddle/infrt/dialect/test_kernels.td rename to paddle/infrt/dialect/infrt/ir/test_kernels.td index 6e4bc26aa1496dcb4caed83f98fc42dab9e3cce0..0ce1f3f65e8f7f46cf32794b3191e66ae71e3eae 100644 --- a/paddle/infrt/dialect/test_kernels.td +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.td @@ -4,12 +4,12 @@ #else #define TEST_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" // Base class for Test dialect ops. class Test_Op traits = []> : - Op { + Op { // Each registered op in the Test namespace needs to provide all of a printer, // parser and verifier. @@ -45,7 +45,7 @@ def BenchmarkOp : Test_Op<"benchmark"> { // The following code benchmarks the infrt.add.i32 kernel. %x = infrt.add.i32 %c, %c // The benchmarked function needs to return exactly one value. 
- Infrt.return %x : i32 + infrt.return %x : i32 } }]; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index ef702650b6f1bbd3615ca7a70880d3c2c04e254b..51addb4deb43824965806962613d1ab4bd1c1e3d 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -2,7 +2,7 @@ #define INFRT_OP_FUSE include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops.td" +include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" include "paddle/infrt/dialect/pd_ops.td" def FuseCvtTensorPattern : Pat< diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index cb16e054418b3b2c6ff843fdaf464d24a42249c2..25ecf2ae99dc3613944fcedaee427b540f0faae4 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -15,7 +15,7 @@ #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops.h" namespace { #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc deleted file mode 100644 index e951762abb20c232232af66d6bf1f2e7568a763b..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/infrt_base.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/infrt/dialect/infrt_base.h" - -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/test_kernels.h" - -namespace infrt { -namespace dialect { - -// ----INFRTDialect definition begin---- -void INFRTDialect::initialize() { - allowUnknownTypes(); - allowUnknownOperations(); - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/basic_kernels.cpp.inc" - >(); - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/test_kernels.cpp.inc" - >(); -} - -mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { - llvm::StringRef keyword; - if (parser.parseKeyword(&keyword)) return mlir::Type(); - // parse TensorMapType, for example: !infrt.tensor_map - parser.emitError(parser.getCurrentLocation(), "unknown infrt type: ") - << keyword; - return mlir::Type(); -} - -void INFRTDialect::printType(mlir::Type type, - mlir::DialectAsmPrinter &printer) const { - // print TensorMapType, for example: !infrt.tensor_map - llvm_unreachable("unknown infrt type."); -} - -// ----INFRTDialect definition end---- - -} // namespace dialect -} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h deleted file mode 100644 index 3ef73171dcdea4e0367837f4b3893405c29a1580..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/infrt_base.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/infrt/dialect/infrt_base.hpp.inc" - -namespace infrt { -namespace dialect { - -class INFRTDialect : public mlir::Dialect { - explicit INFRTDialect(mlir::MLIRContext *context) - : mlir::Dialect( - getDialectNamespace(), context, mlir::TypeID::get()) { - initialize(); - } - - // parse types registered to the dialect. - mlir::Type parseType(mlir::DialectAsmParser &parser) const override; - // print types registered to the dialect. 
- void printType(mlir::Type type, - mlir::DialectAsmPrinter &printer) const override; - - void initialize(); - friend class mlir::MLIRContext; - - public: - static ::llvm::StringRef getDialectNamespace() { return "Infrt"; } -}; -} // namespace dialect - -template -static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getIntegerAttr(b.getI32Type(), constant); -} - -template -static mlir::IntegerAttr createSI32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getSI32IntegerAttr(constant); -} - -template -static mlir::FloatAttr createF32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getF32FloatAttr(constant); -} - -static mlir::SmallVector cvtValueToValueRange( - const mlir::Value &operand) { - return mlir::SmallVector(1, operand); -} - -static mlir::SmallVector concatTwoValueRange( - mlir::ValueRange operand_0, mlir::ValueRange operand_1) { - mlir::SmallVector operands; - operands.append(operand_0.begin(), operand_0.end()); - operands.append(operand_1.begin(), operand_1.end()); - return operands; -} -} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td deleted file mode 100644 index 45e6b116f489709b1d854727870010c7545d92e7..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/infrt_base.td +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef INFRT_BASE -#define INFRT_BASE - -include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" - -def INFRT_Dialect : Dialect { - let name = "Infrt"; - - let description = [{ - The INFRT host dialect. - }]; - - let cppNamespace = "::infrt::dialect"; -} - -def BufferType : OpaqueType<"b", "buffer", "buffer">; - -class INFRT_createI32Attr : NativeCodeCall< - "infrt::createI32Attr($_builder, $_loc, " # value # ")">; - -class INFRT_createSI32Attr : NativeCodeCall< - "infrt::createSI32Attr($_builder, $_loc, " # value # ")">; - -class INFRT_createF32Attr : NativeCodeCall< - "infrt::createF32Attr($_builder, $_loc, " # value # ")">; - -def INFRT_cvtValueToValueRange : NativeCodeCall< - "infrt::cvtValueToValueRange($0)">; - -def INFRT_concatTwoValueRange : NativeCodeCall< - "infrt::concatTwoValueRange($0, $1)">; -#endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_dialects.cc similarity index 83% rename from paddle/infrt/dialect/init_infrt_dialects.cc rename to paddle/infrt/dialect/init_dialects.cc index 5eae01719361dd5bc21c139b54cbcf16f226b4cc..0c5944ebf84750be8cf789552219157da3170c39 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" #include -#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" -#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" @@ -30,8 +30,7 @@ namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT registry.insert #include "paddle/infrt/dialect/diagnostic_utils.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace infrt { namespace dialect { diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 2f721e49a63096d1c3168805d373cbc8809542da..8ccb07161d364e968ead568f20c4b98b18a7e04e 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -22,7 +22,7 @@ #include -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace infrt { namespace dialect { @@ -32,13 +32,13 @@ TEST(MlirLoader, basic) { auto source = R"ROC( func @main() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v0) : (f32) -> () + "infrt.print.f32"(%v0) : (f32) -> () - Infrt.return %value : f32 + infrt.return %value : f32 } )ROC"; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index 5bcf5a23f4c532b1056ceaa54c80902b32e4061a..2006530958f0b5223edfcee87a5895e101f0e240 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -14,7 +14,7 @@ #include #include -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" int main(int argc, char **argv) { mlir::DialectRegistry registry; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index 26425e3945caa2f85547b7b8e8be7dbeaf10e630..f6af4c83aed8bd0b7ce04c172169b036e674777b 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -6,7 +6,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" def PD_Dialect : Dialect { let name = "pd"; diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index 55ab174fcaf059d81f83e54e8f1e5864ef25b7e3..96e9e307f2fd3f33be3d2273a7aa66c363e4beb1 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/infrt/dialect/infrt_base.h" #define GET_OP_CLASSES #include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h index 41dd2ddd94eb161735568170a9a8bdc2ec259cdf..e6b0f30c059054189fe3a86bb112da923ad76423 100644 --- a/paddle/infrt/dialect/pd_ops.h +++ b/paddle/infrt/dialect/pd_ops.h @@ -28,7 +28,7 @@ #include #include #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include 
"paddle/infrt/dialect/infrt/ir/infrt_dialect.h" namespace mlir { namespace pd { diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h deleted file mode 100644 index 0da888a9c076922fc21d5cce004dc839bd705762..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_types.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file defines the types used in PaddlePaddle MLIR dialect. -// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in -// tensorflow). - -#pragma once - -#include -#include -#include -#include -#include - -namespace mlir { -namespace PD { - -class PaddleType : public Type { - public: - using Type::Type; - - static bool classof(Type type); -}; - -namespace detail { - -template -class PaddleTypeImpl : public Type::TypeBase { - public: - using Base = typename Type::TypeBase; - using PDBase = PaddleTypeImpl; - using Base::Base; -}; - -} // namespace detail - -#define HANDLE_PD_TYPE(pdtype, enumerant, name) \ - class pdtype##Type : public detail::PaddleTypeImpl { \ - public: \ - using PDBase::PDBase; \ - }; - -} // namespace PD -} // namespace mlir diff --git a/paddle/infrt/dialect/phi/data_type.cc b/paddle/infrt/dialect/phi/data_type.cc index 5da7ec8831258e52dd157ff444ffcd6e7930e1bb..bbc296ea748a39472bb7f57b04e9159b5fbd89f1 100644 --- a/paddle/infrt/dialect/phi/data_type.cc +++ b/paddle/infrt/dialect/phi/data_type.cc @@ -16,7 +16,7 @@ namespace infrt { -phi::Backend cvtTarget2Phi(TargetType target) { +phi::Backend ConvertTargetToPhi(TargetType target) { switch (target) { case TargetType::CPU: return phi::Backend::CPU; @@ -27,7 +27,7 @@ phi::Backend cvtTarget2Phi(TargetType target) { } } -TargetType cvtTargetFromPhi(phi::Backend backend) { +TargetType ConvertTargetFromPhi(phi::Backend backend) { switch (backend) { case phi::Backend::CPU: return TargetType::CPU; @@ -38,7 +38,7 @@ TargetType cvtTargetFromPhi(phi::Backend backend) { } } -phi::DataType cvtPrecision2Phi(PrecisionType precision) { +phi::DataType ConvertPrecisionToPhi(PrecisionType precision) { #define CONVERT_PRECISION_TO_PHI(Precision) \ case PrecisionType::Precision: \ return phi::DataType::Precision; @@ -61,7 +61,7 @@ phi::DataType cvtPrecision2Phi(PrecisionType precision) { #undef CONVERT_PRECISION_TO_PHI } -PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { +PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype) { #define CONVERT_PRECISION_FROM_PHI(Precision) \ case phi::DataType::Precision: \ return PrecisionType::Precision; @@ -84,7 +84,7 @@ PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { #undef CONVERT_PRECISION_FROM_PHI } -phi::DataLayout cvtLayout2Phi(LayoutType layout) { +phi::DataLayout ConvertLayoutToPhi(LayoutType layout) { switch (layout) { case LayoutType::NCHW: return phi::DataLayout::NCHW; @@ -97,7 +97,7 @@ phi::DataLayout cvtLayout2Phi(LayoutType layout) { } } -LayoutType cvtLayoutFromPhi(phi::DataLayout 
layout) { +LayoutType ConvertLayoutFromPhi(phi::DataLayout layout) { switch (layout) { case phi::DataLayout::NCHW: return LayoutType::NCHW; @@ -110,16 +110,16 @@ LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { } } -phi::KernelKey cvtPlace2Phi(const Place& place) { - return phi::KernelKey(cvtTarget2Phi(place.target), - cvtLayout2Phi(place.layout), - cvtPrecision2Phi(place.precision)); +phi::KernelKey ConvertPlaceToPhi(const Place& place) { + return phi::KernelKey(ConvertTargetToPhi(place.target), + ConvertLayoutToPhi(place.layout), + ConvertPrecisionToPhi(place.precision)); } -Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { - return Place(cvtTargetFromPhi(tensor_arg.backend), - cvtPrecisionFromPhi(tensor_arg.dtype), - cvtLayoutFromPhi(tensor_arg.layout)); +Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg) { + return Place(ConvertTargetFromPhi(tensor_arg.backend), + ConvertPrecisionFromPhi(tensor_arg.dtype), + ConvertLayoutFromPhi(tensor_arg.layout)); } } // namespace infrt diff --git a/paddle/infrt/dialect/phi/data_type.h b/paddle/infrt/dialect/phi/data_type.h index b618ef3861303334b697382f11bfa4fdb4a35c7a..bd258cb1038792e52667b0ef39c65b16c6210eb3 100644 --- a/paddle/infrt/dialect/phi/data_type.h +++ b/paddle/infrt/dialect/phi/data_type.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" @@ -23,16 +23,16 @@ namespace infrt { -phi::Backend cvtTarget2Phi(TargetType target); -TargetType cvtTargetFromPhi(phi::Backend backend); +phi::Backend ConvertTargetToPhi(TargetType target); +TargetType ConvertTargetFromPhi(phi::Backend backend); -phi::DataType cvtPrecision2Phi(PrecisionType precision); -PrecisionType cvtPrecisionFromPhi(phi::DataType datatype); +phi::DataType ConvertPrecisionToPhi(PrecisionType precision); +PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype); -phi::DataLayout cvtLayout2Phi(LayoutType layout); -LayoutType cvtLayoutFromPhi(phi::DataLayout layout); +phi::DataLayout ConvertLayoutToPhi(LayoutType layout); +LayoutType ConvertLayoutFromPhi(phi::DataLayout layout); -phi::KernelKey cvtPlace2Phi(const Place& place); -Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg); +phi::KernelKey ConvertPlaceToPhi(const Place& place); +Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td index 671646b9259ccfd2399862d71d6860db93608eb8..5d7338ec4292ed49112c3cce45a30816e686886d 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td @@ -2,7 +2,7 @@ #define PHI_BASE include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/InferTypeOpInterface.td" def PHI_Dialect : Dialect { diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index ee23470fc754a56ef323c167613f7f32982eedd8..d2ff7acfba8b26f5c0ca1ec459d3b5e2f7fb3d93 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include 
"paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def PHI_CPUKernelDialect : Dialect { diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 21c4669b645fb6c7622fb01ae1c7bacaee0f5ca2..8c3a79498d74d3b80e1590bbc2c0530c7af6411e 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -5,7 +5,7 @@ include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" def PHI_DenseTensorDialect : Dialect { let name = "phi_dt"; diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h index 0ea1973a7331b8a34bf2a286cb55e19a4d09118b..64cd08cc05ed42fe8d53b8c5b8a5bc994bae8824 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h index b84d1b2b7294baf789fe4e1f3911edede8172cf7..4f8b41852cc67e32c510c247e907092046731452 100644 --- a/paddle/infrt/dialect/phi/ir/phi_kernels.h +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -30,7 +30,7 @@ #include #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index d1763897b4a1320179134ede14fe404aee4a6a76..353b1054e71374987207e1055289258915e0774d 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -80,7 +80,7 @@ std::vector getCandidateKernels( phi::KernelKeyMap kernel_key_map = phi::KernelFactory::Instance().SelectKernelMap(name); for (Place place : valid_palces) { - phi::KernelKey kernel_key = cvtPlace2Phi(place); + phi::KernelKey kernel_key = ConvertPlaceToPhi(place); if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { kernel_key = phi::KernelKey(kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, @@ -97,10 +97,10 @@ std::vector getCandidateKernels( const paddle::SmallVector& output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { - phi_kernel_desc.inputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + phi_kernel_desc.inputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } for (auto tensor_arg : output_arg) { - phi_kernel_desc.outputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + phi_kernel_desc.outputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } candidate_kernels.emplace_back(phi_kernel_desc); } diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index 34fd2f0f62dcd9b793f9157003bfd3772d0e1307..b1f7c6c0811def9141e8012518fff5f504934149 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff 
--git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc index fb00a3de3fc0c82dce2489c0f412c64118e3101e..485bf2a75d890aa0df5e888e7284ae3451aa514c 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -24,13 +24,29 @@ #include #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/ops/compat/signatures.h" -namespace infrt { + +namespace { +class phiOpCvtPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } + void runOnFunction() override; + explicit phiOpCvtPass( + std::vector valid_places = std::vector()) + : valid_places_(valid_places) {} + + private: + void convertStage(); + void diapatchStage(); + std::vector valid_places_; +}; + // Implementation of the phiOpCvtPass. void phiOpCvtPass::runOnFunction() { convertStage(); @@ -63,7 +79,7 @@ void phiOpCvtPass::convertStage() { ::phi::KernelSignature kernel_sign = ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( - ProtoArgumentMappingContext(op)); + infrt::ProtoArgumentMappingContext(op)); // resort input&output according to kernel_sign ::llvm::SmallVector inputs, ori_output; ::llvm::SmallVector output_types; @@ -109,10 +125,10 @@ void phiOpCvtPass::diapatchStage() { } mlir::OpBuilder builder(&block, block.begin()); - std::map phi_context; + std::map phi_context; for (infrt::KernelOp kernel_op : worklist) { std::string kernel_name = kernel_op.name().str(); - std::vector candidates = + std::vector candidates = getCandidateKernels(kernel_name, valid_places_); if (candidates.empty()) { LOG(FATAL) << "No candidate kernels for op:" << kernel_name; @@ -121,12 +137,13 @@ void phiOpCvtPass::diapatchStage() { builder.setInsertionPoint(kernel_op); // Todo: Implimentation the concrete pass pick strategy - const PhiKernelDesc &phi_kernel_desc = candidates.front(); + const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front(); - kernel_name = getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + - kernel_name + - getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + - getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); + kernel_name = + infrt::getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + + kernel_name + + infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + + infrt::getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); @@ -134,18 +151,18 @@ void phiOpCvtPass::diapatchStage() { if (phi_context.find(phi_kernel_desc.kernelType.target) == phi_context.end()) { switch (phi_kernel_desc.kernelType.target) { - case TargetType::CPU: { + case infrt::TargetType::CPU: { auto context_value = builder .create( kernel_op.getLoc(), - phi::ContextType::get(kernel_op.getContext(), - TargetType::CPU)) + infrt::phi::ContextType::get(kernel_op.getContext(), + infrt::TargetType::CPU)) .output(); - phi_context[TargetType::CPU] = context_value; + phi_context[infrt::TargetType::CPU] = context_value; } break; - case TargetType::GPU: - case TargetType::UNK: + case infrt::TargetType::GPU: + case 
infrt::TargetType::UNK: default: LOG(FATAL) << "Unsupported TargetType"; break; @@ -155,29 +172,30 @@ void phiOpCvtPass::diapatchStage() { phi_context.at(phi_kernel_desc.kernelType.target)); for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { mlir::Value input = kernel_op.getOperand(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), - DenseTensorType::get(kernel_op.getContext(), - phi_kernel_desc.inputsType[index].target, - phi_kernel_desc.inputsType[index].precision, - phi_kernel_desc.inputsType[index].layout), + infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.inputsType[index].target, + phi_kernel_desc.inputsType[index].precision, + phi_kernel_desc.inputsType[index].layout), input); operation_state.addOperands(cvt_tensor_type_op.output()); } for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); ++index) { - operation_state.addTypes( - DenseTensorType::get(kernel_op.getContext(), - phi_kernel_desc.outputsType[index].target, - phi_kernel_desc.outputsType[index].precision, - phi_kernel_desc.outputsType[index].layout)); + operation_state.addTypes(infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.outputsType[index].target, + phi_kernel_desc.outputsType[index].precision, + phi_kernel_desc.outputsType[index].layout)); } operation_state.addAttributes(kernel_op.attrsAttr().getValue()); mlir::Operation *phi_operation = builder.createOperation(operation_state); for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); ++index) { mlir::Value input = phi_operation->getResult(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); kernel_op.getResult(index).replaceAllUsesWith( cvt_tensor_type_op.output()); @@ -185,4 +203,10 @@ void phiOpCvtPass::diapatchStage() { kernel_op.erase(); } } -} // namespace infrt + +} // namespace + +std::unique_ptr infrt::createPhiOpCvtPass( + std::vector valid_places) { + return std::make_unique(valid_places); +} diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h index 051fee9b61a24772ff2295280fa1b0a1588d7bae..8b1944042aa7c42fef87786af0d0fa131c6f0535 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h @@ -14,44 +14,14 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { /* * phiOpCvtPass. - * - * Convert the general operators in pd Dialect to a infrt.kernelOp. - * - * source func: - * - * func @main() -> tensor { - * %a = "pd.feed"()... - * %c = "pd.conv2d"(%a) ... - * %d = "pd.conv3d"(%c) ... - * %f = "pd.conv2d"(%a) ... - * "pd.fetch" (%d, %f) - * } - * - * destination func: - * func @main() -> tensor { - * %a = "pd.feed"()... - * %c = "infrt.kernel"(%a){name = "conv2d"} ... - * %d = "infrt.kernel"(%c){name = "conv3d"}... - * %f = "infrt.kernel"(%a){name = "conv2d"}... - * "pd.fetch" (%d, %f) - * } + * Convert the general operators from pd Dialect to phi dialect. 
*/ -class phiOpCvtPass - : public mlir::PassWrapper { - public: - ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } - void runOnFunction() override; - explicit phiOpCvtPass(std::vector valid_places = std::vector()) - : valid_places_(valid_places) {} +std::unique_ptr createPhiOpCvtPass( + std::vector valid_places = std::vector()); - private: - void convertStage(); - void diapatchStage(); - std::vector valid_places_; -}; } // namespace infrt diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index 559fb90a64a7868c9c150e12e881d73df7a4aaf2..de61dba8e744c88f279761520ac1815bb265d875 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -38,7 +38,7 @@ int main(int argc, char** argv) { std::vector valid_places = {{infrt::TargetType::CPU, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index a37df265955e70cdf735f251bc8853c7ad4fe831..b118a5f7a9caf42f4aa63dd0222e7a2647addac5 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -31,7 +31,7 @@ #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace cl = llvm::cl; diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/rewrite.td index 5e228fed4d57eb283705c725797c42c5da133c3f..62e7471a390dfeee1a9ddfc15033e85db0adca2e 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/rewrite.td @@ -1,7 +1,7 @@ #ifndef INFRT_REWRITE #define INFRT_REWRITE -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/pd_ops.td" include "paddle/infrt/dialect/pd_extra_ops.td" diff --git a/paddle/infrt/dialect/tensor_shape.td b/paddle/infrt/dialect/tensor_shape.td index d3714c8ed14d3f1aea50ec4c55a9c4c2fb85e958..2be21d6aa772020519e3d909c9bdf7232f7ff985 100644 --- a/paddle/infrt/dialect/tensor_shape.td +++ b/paddle/infrt/dialect/tensor_shape.td @@ -2,7 +2,7 @@ #else #define INFRT_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/tensor_shape_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td index 68ca1559acee03580eea0842bfbac3593d418c02..46c250b05492cefe61d8e677a352a217718189b8 100644 --- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -2,7 +2,7 @@ #define PD_LOWER_TO_TRT include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/pd_ops.td" include "paddle/infrt/dialect/tensorrt/trt_ops.td" diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index 803e53e3244f92134928e1105a8248e9f49e5432..18afba19e06189294078bcfc1a0b2bb341eb7126 100644 --- 
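
// With phiOpCvtPass moved into an anonymous namespace of the .cc file, the
// factory declared in phi_op_cvt_pass.h is the only public way to construct
// the pass. Typical wiring, modelled on the phi_ir_exec.cc hunk below (the
// function and pass-manager names here are assumptions, not part of the
// patch):
#include <vector>

#include <mlir/Pass/PassManager.h>

#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"

static void AddPhiOpCvtPass(mlir::OpPassManager &phi_pass_manager) {
  std::vector<infrt::Place> valid_places = {{infrt::TargetType::CPU,
                                             infrt::PrecisionType::FLOAT32,
                                             infrt::LayoutType::NCHW}};
  // The concrete pass type is no longer visible to callers; only the factory
  // returning a std::unique_ptr is.
  phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places));
}
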
a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -28,17 +27,17 @@ namespace trt { * func @main(%a : tensor) -> tensor { * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m... * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "infrt.return" (%m) + * infrt.return %m... * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m... * } ... - * "infrt.return" (%d, %f).. + * infrt.return %d, %f :... * } * * destination func: @@ -47,9 +46,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s) + * infrt.return %n, %s:... * } ... - * "infrt.return" (%d, %f) + * infrt.return %d, %f:... * } */ class TRTGraphFusePass diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 1c44a13cf9dfb65a1747a596dc1012e7f54d792e..a5dd4f14b2946fe232b7b725f6ace7caf74ff4d4 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -31,9 +30,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s : ... * } ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f : ... * } * * destination func: @@ -41,7 +40,7 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f:... 
* } */ class TRTGraphSplitPass diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 1be5f4dbc39d7699b6d8a36cfb3e164694e908c1..83bebdb6bf19bdf8f75d11d693813b8169e297a0 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -14,7 +14,6 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include #include -#include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" @@ -24,7 +23,7 @@ namespace trt { #include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc" // NOLINT struct PD2TRT_GraphLower : public ::mlir::RewritePattern { - PD2TRT_GraphLower(::mlir::MLIRContext *context) + explicit PD2TRT_GraphLower(::mlir::MLIRContext *context) : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {} ::mlir::LogicalResult matchAndRewrite( ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h index 7550d8c84e19504fc0f41067c1194703a55410ba..ede64f8bcd556a73b779fc3b772bf3fa8f74eaf9 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -15,7 +15,7 @@ #pragma once #include "mlir/IR/Dialect.h" #include "mlir/Pass/Pass.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { @@ -29,9 +29,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s:... * } ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f:... * } * * destination ir: @@ -40,10 +40,10 @@ namespace trt { * %m = "trt.Convolution"(%a)... * %n = "trt.Convolution"(%m)... * %s = "trt.Convolution"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s :... * }){run_once = true} ... * %d, %f = "trt.execute"(%engine, %a)... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f :... * } */ struct TRTOpConverterPass diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 13b7f1aee55d2a2d30822a878bbd50d385411f43..9f348b4122fc74033703c92459e6cfa5b3a1f3a2 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -15,8 +15,8 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index b9e461c8633d906fd46e9f7d6799e8a157915048..1cb08dc0a2161eeb5720191bada52f9b54e94893 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -29,24 +28,24 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... 
* %f = "pd.conv2d"(%a) ... - * "infrt.return"(%d, %f) ... + * infrt.return %d, %f: ... * } * * destination func: * func @main(%a : tensor) -> tensor { * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... - * "infrt.return" (%d, %f) + * infrt.return %d, %f:... * } * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 44444232915bad7d25b0ecedfa8e8427f4567e49..78d960b5120454bdd01b779abedbe2f7ec0d5853 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -28,8 +28,8 @@ #include #include #include -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops.h" namespace infrt { diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir index 1a7ea854c9ce469ee5719743287b4ee1b5de9286..843b12ced21a982b18b5a63f7bbef1d4d24eea16 100644 --- a/paddle/infrt/external_kernels/basic.mlir +++ b/paddle/infrt/external_kernels/basic.mlir @@ -1,7 +1,7 @@ // CHECK: basic func @basic() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 @@ -17,5 +17,5 @@ func @basic() -> f32 { // CHECK: 6 "external.print.f32"(%v3) : (f32) -> () - Infrt.return %v3 : f32 + infrt.return %v3 : f32 } diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir index b0cabddc3ebc4a9ede73d506ac58acaa140f03d5..26b2d24cace70455d4a0e21dddf23c9bd628ae81 100644 --- a/paddle/infrt/external_kernels/fc.mlir +++ b/paddle/infrt/external_kernels/fc.mlir @@ -1,43 +1,43 @@ // CHECK-LABEL: @fc -func @fc(%input : !Infrt.tensor, - %w : !Infrt.tensor, - %bias : !Infrt.tensor) -> !Infrt.tensor +func @fc(%input : !infrt.dense_tensor, + %w : !infrt.dense_tensor, + %bias : !infrt.dense_tensor) -> !infrt.dense_tensor { - %out = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - // dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + // dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor) {value=0.0:f32} // fc1 - "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () // fc2 - "external.matmul"(%out, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, 
!Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + "external.matmul"(%out, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () - Infrt.return %out : !Infrt.tensor + infrt.return %out : !infrt.dense_tensor } // CHECK-LABEL: @benchmark func @benchmark() { - %input = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [50, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor) {value=3.0:f32} - Infrt.benchmark "add.f32"( - %input:!Infrt.tensor, - %w:!Infrt.tensor, - %bias:!Infrt.tensor) + infrt.benchmark "add.f32"( + %input:!infrt.dense_tensor, + %w:!infrt.dense_tensor, + %bias:!infrt.dense_tensor) duration_secs = 100, max_count = 300000, num_warmup_runs = 3 { - %res = Infrt.call @fc(%input, %w, %bias) : (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> (!Infrt.tensor) - Infrt.return %res : !Infrt.tensor + %res = infrt.call @fc(%input, %w, %bias) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + infrt.return %res : !infrt.dense_tensor } - Infrt.return + infrt.return } diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir index d55d9904b5bc4e43388abacf9e4b62bf06db458b..97781e5c8c5e544bba53b561f2adcae16263886f 100644 --- a/paddle/infrt/external_kernels/paddle.mlir +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -1,50 +1,50 @@ // CHECK: paddle_func func @paddle_func() -> () { - %input = dt.create_uninit_tensor.f32 [3, 5] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [5, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor) {value=3.0:f32} - %out = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor) {value=0.0:f32} - "external.fc2"(%input, %w, 
%bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%input : !Infrt.tensor) + dt.print_tensor (%input : !infrt.dense_tensor) // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] - dt.print_tensor (%w : !Infrt.tensor) - dt.print_tensor (%bias : !Infrt.tensor) - dt.print_tensor (%out : !Infrt.tensor) + dt.print_tensor (%w : !infrt.dense_tensor) + dt.print_tensor (%bias : !infrt.dense_tensor) + dt.print_tensor (%out : !infrt.dense_tensor) // test external.matmul - %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out1 : !Infrt.tensor) {value=0.0:f32} - "external.matmul"(%input, %w, %out1) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out1 : !Infrt.tensor) + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out1 : !infrt.dense_tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out1 : !infrt.dense_tensor) // test external.elementwise_add - %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out2 : !Infrt.tensor) {value=0.0:f32} - %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias1 : !Infrt.tensor) {value=3.0:f32} - "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out2 : !Infrt.tensor) + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out2 : !infrt.dense_tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.dense_tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out2 : !infrt.dense_tensor) // test external.relu - %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out3 : !Infrt.tensor) {value=0.0:f32} - "external.relu"(%out1, %out3) {}: (!Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out3 : !Infrt.tensor) + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out3 : !infrt.dense_tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out3 : !infrt.dense_tensor) // test external.sigmoid - %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out4 : !Infrt.tensor) {value=0.0:f32} - "external.sigmoid"(%out1, %out4) {}: (!Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out4 : !Infrt.tensor) + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out4 : !infrt.dense_tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out4 : !infrt.dense_tensor) - Infrt.return + 
infrt.return } diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc index f343dfc71b040e77308b30c2963fb4014221e29c..4209b2a9648d8be0a9a3897c27c7a35113cba424 100644 --- a/paddle/infrt/host_context/kernel_registry.cc +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -23,8 +23,9 @@ namespace infrt { namespace host_context { struct KernelRegistry::Impl { - std::unordered_map data; - std::unordered_map> attr_names; + std::unordered_map>> + data; }; KernelRegistry::KernelRegistry() : impl_(std::make_unique()) {} @@ -33,20 +34,29 @@ void KernelRegistry::AddKernel(const std::string &key, KernelImplementation fn) { CHECK(!impl_->data.count(key)) << "kernel [" << key << "] is registered twice"; - impl_->data.emplace(key, fn); + impl_->data.emplace( + key, std::make_pair(std::move(fn), std::vector{})); } -void KernelRegistry::AddKernelAttrNameList( - const std::string &key, const std::vector &names) { - CHECK(!impl_->attr_names.count(key)) - << "kernel [" << key << "] is registered twice in attribute names"; - impl_->attr_names.emplace( - key, llvm::SmallVector(names.begin(), names.end())); +const std::vector &KernelRegistry::GetAttrNameList( + const std::string &key) const { + CHECK(impl_->data.count(key)); + return impl_->data[key].second; +} + +void KernelRegistry::AddKernelWithAttrs( + const std::string &key, + KernelImplementation fn, + std::vector &&attr_order) { + CHECK(!impl_->data.count(key)) << "kernel [" << key + << "] is registered twice"; + impl_->data.emplace(key, + std::make_pair(std::move(fn), std::move(attr_order))); } KernelImplementation KernelRegistry::GetKernel(const std::string &key) const { auto it = impl_->data.find(key); - return it != impl_->data.end() ? it->second : KernelImplementation{}; + return it != impl_->data.end() ? 
it->second.first : KernelImplementation{}; } std::vector KernelRegistry::GetKernelList() const { diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h index a813f690efb0b3d36b7575d0889652f0868a2d85..a146b2b3c4c1e1090b5ac1843466b93a31b0bb0b 100644 --- a/paddle/infrt/host_context/kernel_registry.h +++ b/paddle/infrt/host_context/kernel_registry.h @@ -34,10 +34,14 @@ class KernelRegistry { KernelRegistry(); void AddKernel(const std::string &key, KernelImplementation fn); - void AddKernelAttrNameList(const std::string &key, - const std::vector &names); + void AddKernelWithAttrs(const std::string &key, + KernelImplementation fn, + std::vector &&attrs_order); KernelImplementation GetKernel(const std::string &key) const; + const std::vector &GetAttrNameList( + const std::string &key) const; + std::vector GetKernelList() const; size_t size() const; diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 90bcb1df220c0f4c558ece80a09fccc93aada41c..1506282f6268191a2eece5540d30fbe90d8eeb52 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -92,7 +92,7 @@ int main(int argc, char** argv) { std::vector valid_places = {{infrt::TargetType::CPU, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); #endif diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc index 47ec27ebec300f1cedd57b11e0dd1e6b37611141..ec8d43f99bae770f28cbf1b1bdc269536b4e7100 100644 --- a/paddle/infrt/host_context/mlir_function_executable.cc +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -43,6 +43,7 @@ MlirFunctionExecutable::MlirFunctionExecutable( func_op.getNumResults()), MlirToRuntimeTranslator(&core_runtime_builder_), region_(&func_op.getRegion()), + kernel_registry_(kernel_registry), core_runtime_builder_(kernel_registry), function_table_(function_table) {} @@ -54,6 +55,7 @@ MlirFunctionExecutable::MlirFunctionExecutable( : Function("", func_type.getNumInputs(), func_type.getNumResults()), MlirToRuntimeTranslator(&core_runtime_builder_), region_(region), + kernel_registry_(kernel_registry), core_runtime_builder_(kernel_registry), function_table_(function_table) {} @@ -90,7 +92,7 @@ void MlirFunctionExecutable::BuildExecutables( if (EmitCallOp(&op, &function_table_)) continue; - if (EmitGeneralOp(&op)) continue; + if (EmitGeneralOp(&op, *kernel_registry_)) continue; LOG(FATAL) << "Not supported op: " << DumpToString(op); } diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h index a6428df86e6b27061d92856970682bc29499d825..cd9161d01bbf648c344ec2a82747d997b810856a 100644 --- a/paddle/infrt/host_context/mlir_function_executable.h +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -70,6 +70,7 @@ class MlirFunctionExecutable : public Function, public MlirToRuntimeTranslator { private: mlir::Region* region_{}; + KernelRegistry* kernel_registry_{}; CoreRuntimeBuilder core_runtime_builder_; MlirToRuntimeTranslator::function_defs_t& function_table_; std::function copy_res_fn_; diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir index 
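
// KernelRegistry now keeps a single map whose values pair the kernel
// implementation with its declared attribute-name order, replacing the
// separate attr_names map and AddKernelAttrNameList(). A usage sketch of the
// new interface (kernel names are placeholders and the empty
// KernelImplementation stands in for a real kernel wrapped with
// INFRT_KERNEL(...)):
#include <vector>

#include "paddle/infrt/host_context/kernel_registry.h"

static void RegisterDemoKernels(
    infrt::host_context::KernelRegistry *registry) {
  infrt::host_context::KernelImplementation fn{};  // placeholder implementation

  // Attribute-free kernels keep the old entry point.
  registry->AddKernel("demo.plain_kernel", fn);

  // Kernels with attributes now declare the expected order at registration
  // time ...
  registry->AddKernelWithAttrs("demo.attr_kernel", fn,
                               {"dims", "lod", "layout", "precision"});

  // ... and the MLIR-to-runtime translator later retrieves that order by
  // kernel name.
  const std::vector<const char *> &order =
      registry->GetAttrNameList("demo.attr_kernel");
  (void)order;
}
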
1b55b408f2b082c09d06d51037e8c9d967a171f4..263d5884134b143aa8d3403c5cd05672df39636f 100644 --- a/paddle/infrt/host_context/mlir_tests/basic.mlir +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: basic func @basic() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 - "Infrt.print.f32"(%v0) : (f32) -> () + "infrt.print.f32"(%v0) : (f32) -> () // CHECK: 2 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () // CHECK: 3 - "Infrt.print.f32"(%v2) : (f32) -> () + "infrt.print.f32"(%v2) : (f32) -> () - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 // CHECK: 6 - "Infrt.print.f32"(%v3) : (f32) -> () + "infrt.print.f32"(%v3) : (f32) -> () - Infrt.return %v3 : f32 + infrt.return %v3 : f32 } // CHECK-LABEL: basic1 // Check the mlir executor can work with more than one function in a file. func @basic1() -> () { - %v0 = Infrt.constant.f32 1.0 - "Infrt.print.f32"(%v0) : (f32) -> () + %v0 = infrt.constant.f32 1.0 + "infrt.print.f32"(%v0) : (f32) -> () // CHECK: 1 - Infrt.return + infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir index 5a973a3eb23e6015ede2d69d83ab8c26de669908..1a7fa28f1e58bd400671099f5af7bedbb3c04e4d 100644 --- a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -1,9 +1,9 @@ // CHECK-LABEL: build_tensor1 func @build_tensor1() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !Infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !Infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir index 22df1c8010d8dbd6a4b8e332e01602b4421ebcdd..691ce62cbf82ad4dc0d3b0199a9c1d1127213de5 100644 --- a/paddle/infrt/host_context/mlir_tests/shape.mlir +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -3,5 +3,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return -} \ No newline at end of file + infrt.return +} diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index a901c323ec03a418a32eee3cb8ea17708e38bdb9..c613843cd1779599fbac5aea6042b26b151534e8 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -75,7 +75,7 @@ struct MlirToRuntimeTranslator::Impl { }; bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { - if (!infrt::Startswith(op->getName().getStringRef().str(), "Infrt.constant")) + if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) return false; VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() << "]"; @@ -267,10 +267,11 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } static bool 
IsReturn(mlir::Operation* op) { - return op->getName().getStringRef() == "Infrt.return"; + return op->getName().getStringRef() == "infrt.return"; } -bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { +bool MlirToRuntimeTranslator::EmitGeneralOp( + mlir::Operation* op, const KernelRegistry& kernel_registry) { CHECK(impl_->runtime); impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); @@ -308,42 +309,80 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { // process attributes auto attrs = op->getAttrs(); + // MLIR's underlying attr storage type is `Builtin_Dictionary`, and its + // elements + // are sorted by name. The following code adapts the order of function + // signatures + // of the phi operator library. + llvm::SmallVector tmp; + tmp.resize(attrs.size()); + const std::string& kernel_name = op->getName().getStringRef().str(); + const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name); + if (attrs.size() && attr_names.empty()) { + LOG(WARNING) << "The kernel `" << kernel_name + << "` has no specified attr order."; + } + auto get_offset = [](const char* attr, + const std::vector& names, + const std::string& kernel_name) -> int { + for (size_t i = 0; i < names.size(); ++i) { + if (!std::strcmp(attr, names[i])) { + return i; + } + } + LOG(WARNING) << "The attribute `" << attr << "` of kernel `" << kernel_name + << "` is not properly registered with " + "`KernelRegistry::AddKernelWithAttrs()`."; + return -1; + }; + for (size_t i = 0; i < attrs.size(); i++) { auto& attr = attrs[i]; + int offset{}; + if (attr_names.size()) { + offset = get_offset(attr.getName().data(), attr_names, kernel_name); + } else { + offset = i; + } + CHECK_NE(offset, -1); if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::TargetType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::PrecisionType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::LayoutType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = 
EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else { LOG(FATAL) << "Not supported attribute type"; } } + for (size_t i = 0; i < tmp.size(); i++) { + impl_->cur_op->AppendAttribute(tmp[i]); + } + // process results llvm::SmallVector res_values; for (int i = 0, e = op->getNumResults(); i < e; i++) { @@ -405,7 +444,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { bool MlirToRuntimeTranslator::EmitReturnOp( mlir::Operation* op, llvm::SmallVectorImpl* results) { CHECK(results); - if (op->getName().getStringRef() == "Infrt.return") { + if (op->getName().getStringRef() == "infrt.return") { for (size_t i = 0; i < op->getNumOperands(); i++) { results->push_back(op->getOperand(i)); } @@ -478,7 +517,7 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, function_defs_t* function_table) { CHECK(op); CHECK(function_table); - if (op->getName().getStringRef() != "Infrt.call") return false; + if (op->getName().getStringRef() != "infrt.call") return false; impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); @@ -598,7 +637,7 @@ class MlirProgramTestExecutor : public MlirToRuntimeTranslator { llvm::SmallVector results; if (EmitReturnOp(&op, &results)) continue; if (EmitCallOp(&op, &impl_->func_defs)) continue; - if (EmitGeneralOp(&op)) continue; + if (EmitGeneralOp(&op, *registry)) continue; LOG(FATAL) << "Not supported op: " << DumpToString(op); } diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 0c453651d9e6dc44adaf108ec6a1b0df984fe8be..27a7f20168667daddd353e902d49479aa612e38f 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -57,13 +57,14 @@ class MlirToRuntimeTranslator { protected: //! Emit a "infrt.constant.*" operation, return true if succeed. bool EmitConstantOp(mlir::Operation* op); - //! Emit a "Infrt.return" operation. + //! Emit a "infrt.return" operation. bool EmitReturnOp(mlir::Operation* op, llvm::SmallVectorImpl* results); //! Emit a "ts.build_shape" operation. bool EmitBuildShapeOp(mlir::Operation* op); //! Emit an operation other than the special cases above. - bool EmitGeneralOp(mlir::Operation* op); + bool EmitGeneralOp(mlir::Operation* op, + const KernelRegistry& kernel_registry); //! Emit all the functions. 
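
// Rationale for the reordering in EmitGeneralOp above: MLIR's attribute
// storage (Builtin_Dictionary) hands the attributes back sorted by name,
// while the phi kernel consumes them in the order registered via
// AddKernelWithAttrs(). Stripped of the Value plumbing, the index computation
// reduces to this standalone restatement (not code from the patch):
#include <cstring>
#include <string>
#include <vector>

// For each attribute name as MLIR reports it, return the slot it must occupy
// in the kernel's attribute list, or -1 if it was never registered.
static std::vector<int> ComputeAttrOffsets(
    const std::vector<std::string> &mlir_order,       // alphabetical order
    const std::vector<const char *> &declared_order)  // registered order
{
  std::vector<int> offsets;
  offsets.reserve(mlir_order.size());
  for (const std::string &name : mlir_order) {
    int offset = -1;
    for (size_t i = 0; i < declared_order.size(); ++i) {
      if (std::strcmp(name.c_str(), declared_order[i]) == 0) {
        offset = static_cast<int>(i);
        break;
      }
    }
    offsets.push_back(offset);
  }
  return offsets;
}
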
bool EmitFunctions(); diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 5824e40abf97a4d63543948d056e815bbeebce3a..31615fbc3f6e46f55ddc5f56641750feb0972772 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -37,14 +37,14 @@ TEST(MlirToRuntimeTranslate, basic) { auto source = R"ROC( func @main() -> () { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () - Infrt.return + infrt.return } )ROC"; @@ -63,14 +63,14 @@ TEST(TestMlir, basic) { auto source = R"ROC( func @main() -> () { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () - Infrt.return + infrt.return } )ROC"; @@ -101,7 +101,7 @@ func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor< "!infrt.dense_tensor"; auto end = R"ROC( -Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor +infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } )ROC"; diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 6afef5935c73450b4865c0e02593aa372299c95f..18c25827b8ec5a71907e694cea4e7680b598e883 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -19,7 +19,6 @@ MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); - context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 78dfefcfda2c83760492766507999322152187eb..e825cbb5a11ea0dfcacfc2b1bbb63bf201219c9d 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -25,10 +25,10 @@ #include "mlir/IR/MLIRContext.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/common/string.h" -#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt_base.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" + +#include "paddle/infrt/dialect/init_dialects.h" #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/paddle/model_parser.h" diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 86df3508cf813628b4a8ba8412ce93d6b1dfc5a2..957d852442b10620244e230a2f7704eb7fa0a33e 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -22,7 +22,7 @@ #include "paddle/infrt/common/object.h" #include 
"paddle/infrt/common/shared.h" -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/function.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index 23e50a5ddc87427bbf0f49c559f185084e42c8ec..b186cfcfd2b355f97711ecc916e497c2916d4060 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -63,24 +63,24 @@ static void PrintString(const std::string &str) { void RegisterBasicKernels(host_context::KernelRegistry *registry) { RegisterIntBasicKernels(registry); RegisterFloatBasicKernels(registry); - registry->AddKernel("Infrt.get_string", INFRT_KERNEL(GetString)); - registry->AddKernel("Infrt.print_string", INFRT_KERNEL(PrintString)); + registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); } void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.add.i32", INFRT_KERNEL(add)); - registry->AddKernel("Infrt.sub.i32", INFRT_KERNEL(sub)); - registry->AddKernel("Infrt.mul.i32", INFRT_KERNEL(mul)); - registry->AddKernel("Infrt.div.i32", INFRT_KERNEL(div)); - registry->AddKernel("Infrt.print.i32", INFRT_KERNEL(print)); + registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); } void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.add.f32", INFRT_KERNEL(add)); - registry->AddKernel("Infrt.sub.f32", INFRT_KERNEL(sub)); - registry->AddKernel("Infrt.mul.f32", INFRT_KERNEL(mul)); - registry->AddKernel("Infrt.div.f32", INFRT_KERNEL(div)); - registry->AddKernel("Infrt.print.f32", INFRT_KERNEL(print)); + registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); } } // namespace kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc index 8b18aca0210860f4ae688f2133ffa022fda3195d..6cc94dbcce0775cb6b74f993bfdd262fd6a47e6f 100644 --- a/paddle/infrt/kernel/control_flow_kernels.cc +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -37,7 +37,7 @@ static void INFRTCall( } void RegisterControlFlowKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("Infrt.call", INFRT_KERNEL(INFRTCall)); + registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); } } // namespace kernel diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index e89ee7cfe5d6f51b3206aecc6ca283e06c0e5561..777fb29ac60d9c7125898752747bbdf553f370c0 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -23,23 +23,23 @@ namespace phi { ::phi::DenseTensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, - host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute> lod, + 
host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision) { return ::phi::DenseTensor( const_cast<::phi::Allocator*>(&context.GetAllocator()), - ::phi::DenseTensorMeta(cvtPrecision2Phi(precision.get()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), ::phi::make_ddim(dims.get()), - cvtLayout2Phi(layout.get()), + ConvertLayoutToPhi(layout.get()), {})); } void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values) { + host_context::Attribute> value) { auto place = ::phi::CPUPlace(); float* a_data = dense_tensor->mutable_data(place); for (int64_t i = 0; i < dense_tensor->numel(); ++i) { - a_data[i] = (values.get())[i]; + a_data[i] = (value.get())[i]; } } @@ -57,7 +57,7 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { ::phi::DDim dims = dense_tensor->dims(); std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," - << " values=["; + << " value=["; switch (dense_tensor->dtype()) { PRINT_META_DATA(FLOAT32, float); PRINT_META_DATA(INT32, int32_t); diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 187e5c64511e83556bec50f4368ae7cbe89dda90..8cc0e39e0e4431f073ac37a7f0557f2c837dc753 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/infrt/backends/host/phi_allocator.h" -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/phi/core/dense_tensor.h" @@ -26,8 +26,8 @@ namespace phi { ::phi::DenseTensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, - host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 90570484179d1e555f86c55ea0e8ac4f9bc83c53..0e071418603f8390ca3283f617b06cf1fa91b94c 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -34,10 +34,14 @@ namespace kernel { void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.create_context.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext)); - registry->AddKernel("phi_dt.create_dense_tensor", - INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor)); - registry->AddKernel("phi_dt.fill_dense_tensor.f32", - INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor", + INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor), + {"dims", "lod", "layout", "precision"}); + registry->AddKernelWithAttrs( + "phi_dt.fill_dense_tensor.f32", + INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32), + {"value"}); registry->AddKernel("phi_dt.print_tensor", INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); } diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index d5922af9ada1f4983fe14df87c09180fe17fda19..b7503aa4ef35894dda514fdb7fa4336485323094 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -111,9 +111,9 @@ void NaiveMatmul(const DenseHostTensor &x, /// ===== Kernel end ==== void 
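
// The attribute list handed to AddKernelWithAttrs() defines the order in
// which attribute Values reach the kernel, so it must mirror the kernel's
// Attribute parameter order; that is why the lod/layout parameters of
// CreateDenseTensor are swapped in the same patch. Annotated restatement of
// the registration above (the helper function and the INFRT_KERNEL include
// location are assumptions for illustration):
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"

static void RegisterCreateDenseTensor(
    infrt::host_context::KernelRegistry *registry) {
  registry->AddKernelWithAttrs(
      "phi_dt.create_dense_tensor",
      INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor),
      {"dims",         // 1st Attribute parameter of CreateDenseTensor
       "lod",          // 2nd (moved ahead of layout by this patch)
       "layout",       // 3rd
       "precision"});  // 4th
}
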
RegisterTensorKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("dt.create_uninit_tensor.f32", - INFRT_KERNEL(CreateUninitTensor)); - registry->AddKernelAttrNameList("dt.create_uninit_tensor.f32", {"shape"}); + registry->AddKernelWithAttrs("dt.create_uninit_tensor.f32", + INFRT_KERNEL(CreateUninitTensor), + {"shape"}); registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); registry->AddKernel("dt.fill_tensor_with_constant.f32", INFRT_KERNEL(FillTensorWithConstant)); diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index d15bbe221f91a87b047863121f32699175183c54..bcf475d1bc09dab8be1b7a23359e1eb935ee02e0 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -193,7 +193,7 @@ tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { } void RegisterTestKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); registry->AddKernel("Infrt.test.shadow_copy_tensor", INFRT_KERNEL(ShadowCopyTensor)); } diff --git a/paddle/infrt/tests/dialect/basic.mlir b/paddle/infrt/tests/dialect/basic.mlir index 2d4d6f2629ec7df989499f0a2e9649c01ae8428a..f534a3aa44aac964c262465da199ac926fa0904e 100644 --- a/paddle/infrt/tests/dialect/basic.mlir +++ b/paddle/infrt/tests/dialect/basic.mlir @@ -1,33 +1,33 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: @basic_f32 func @basic_f32() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK-NEXT: 3 - "Infrt.print.f32"(%value) : (f32) -> () + "infrt.print.f32"(%value) : (f32) -> () - Infrt.return %value : f32 + infrt.return %value : f32 } /// ================================================================ /// @caller call the other function @callee func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { - %z = "Infrt.add.f32"(%x, %y) : (f32, f32) -> f32 - %z1 = "Infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 - Infrt.return %z1 : f32 + %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + infrt.return %z1 : f32 } // CHECK-LABEL: @caller.add.f32 func @caller.add.f32() -> f32 { - %x = Infrt.constant.f32 1.0 - %y = Infrt.constant.f32 2.0 - %y1 = Infrt.constant.f32 3.0 - %z = Infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + %x = infrt.constant.f32 1.0 + %y = infrt.constant.f32 2.0 + %y1 = infrt.constant.f32 3.0 + %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 // CHECK-NEXT: 6 - "Infrt.print.f32"(%z) : (f32) -> () - Infrt.return %z : f32 + "infrt.print.f32"(%z) : (f32) -> () + infrt.return %z : f32 } /// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/paddle/infrt/tests/dialect/benchmark.mlir b/paddle/infrt/tests/dialect/benchmark.mlir index 381fd534f6a5a09e3091203de88ebf00101074af..1a57b43499062410b346b38412a533d3edd6fbcc 100644 --- a/paddle/infrt/tests/dialect/benchmark.mlir +++ b/paddle/infrt/tests/dialect/benchmark.mlir @@ -12,13 +12,13 @@ func @benchmark() { // CHECK-LABEL: BM:add.f32:CPU 95%(ns) // CHECK-LABEL: BM:add.f32:CPU 99%(ns) // CHECK-LABEL: BM:add.f32:CPU utilization(percent) - Infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + infrt.benchmark "add.f32"() 
duration_secs = 1, max_count = 3, num_warmup_runs = 3 { - %0 = Infrt.constant.f32 1.0 - %1 = Infrt.constant.f32 2.0 - %res = "Infrt.add.f32"(%0, %1) : (f32, f32) -> f32 - "Infrt.print.f32"(%res) : (f32) -> () - Infrt.return %res : f32 + %0 = infrt.constant.f32 1.0 + %1 = infrt.constant.f32 2.0 + %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "infrt.print.f32"(%res) : (f32) -> () + infrt.return %res : f32 } - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/dense_tensor.mlir b/paddle/infrt/tests/dialect/dense_tensor.mlir index faade62d35063b1d85c4c1d3ddad98b085a7726c..6dc9904610477139b6c254d0f9f7b754041a83cc 100644 --- a/paddle/infrt/tests/dialect/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/dense_tensor.mlir @@ -4,14 +4,14 @@ func @dense_shape0() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - Infrt.return + infrt.return } func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor + infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } @@ -19,6 +19,6 @@ func @main() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) - Infrt.return + %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir index 1cae065bd5fb6a6a1aa06b4cd6605a240917b55f..936c8f32c01521817e185fa80e836018e7b02aa8 100644 --- a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir +++ b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: @predict -func @predict(%input:!Infrt.tensor, %map: !Infrt.tensor_map) -> (!Infrt.tensor) { - %w = dt.get_param(%map, "create_parameter_0.w_0") -> !Infrt.tensor - %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !Infrt.tensor +func @predict(%input:!infrt.dense_tensor, %map: !infrt.dense_tensor_map) -> (!infrt.dense_tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.dense_tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.dense_tensor - %out = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor + %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor // fc - "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () - //dt.print_tensor (%out : !Infrt.tensor) + "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + //dt.print_tensor (%out : !infrt.dense_tensor) - Infrt.return %out : !Infrt.tensor + infrt.return %out : !infrt.dense_tensor } // 
CHECK-LABEL: @main func @main() { - %input = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} // CHECK-LABEL: loading params %map = dt.load_params() {path="/Infrt/build/paddle/paddle_1.8_fc_model"} - %out = Infrt.call @predict(%input, %map): (!Infrt.tensor, !Infrt.tensor_map) -> (!Infrt.tensor) - dt.print_tensor (%out : !Infrt.tensor) + %out = infrt.call @predict(%input, %map): (!infrt.dense_tensor, !infrt.dense_tensor_map) -> (!infrt.dense_tensor) + dt.print_tensor (%out : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index 48ee4b9d725c0aa36d4849c2842c99997de5c8ee..4b8055514936417dd83a6bb23afaea31eb2d1013 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -5,5 +5,5 @@ func @ops() { %b = pd.feed() {name="input1"}: tensor %d = pd.feed() {name="input3"}: !infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index e8f09f07c82c4003e23a54c7275f576f7916f853..3657777a5b0bce1c5a5e4df8d59695f8b122da56 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -9,8 +9,8 @@ func @sign_any_float32_execute() { "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () %e = "phi_cpu.sign.float32.any"(%ctx, %t) : (!phi.context, !infrt.dense_tensor) -> (!infrt.dense_tensor) - // CHECK: dense_tensor: shape=shape[1], values=[1] + // CHECK: dense_tensor: shape=shape[1], value=[1] "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 923f4e9d9d2ce6f6a24f91f04721f49712f900b5..5b0fa735897a31287bb6dea487e2f22eacd7b0aa 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -2,14 +2,14 @@ module { func @predict(%arg0: !infrt.dense_tensor) -> !infrt.dense_tensor { %2 = "pd.abs"(%arg0) : (!infrt.dense_tensor) -> !infrt.dense_tensor - Infrt.return %2 : !infrt.dense_tensor + infrt.return %2 : !infrt.dense_tensor } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () - %2 = Infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor phi_dt.print_tensor(%2 : !infrt.dense_tensor) - Infrt.return + infrt.return } } diff --git a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir index 76ae140dd6cbd741f992315ee35d3e94058d4674..47bc1f7833140c8a876660673fa11f148d42db90 100644 --- a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir @@ -3,14 +3,14 @@ func @dense_shape0() { %a = 
dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - Infrt.return + infrt.return } func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor + infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } @@ -18,6 +18,6 @@ func @main() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) - Infrt.return + %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir index 52b296e06cd365fbaa1249108f877dc9f7480ff0..d6b69fdd595ea520f623e4b9651fc6e2b321c26f 100644 --- a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir +++ b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir @@ -13,7 +13,7 @@ func @naive_elementwise_add() { // CHECK: tensor: shape=shape[2,8], values=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] dt.print_tensor (%c : !infrt.dense_tensor) - Infrt.return + infrt.return } // RUN: infrtexec -i %s | FileCheck %s @@ -31,5 +31,5 @@ func @naive_matmul() { // CHECK: tensor: shape=shape[2,4], values=[16, 16, 16, 16, 16, 16, 16, 16] dt.print_tensor (%c : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 28450ed6bd823f7d18eff19371a2a1a49292b329..7aeb3f8a4d0513deaed6bda73a591790b633d0db 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -3,12 +3,12 @@ func @load_tensor_map() { %map = dt.load_params(){path="@CMAKE_BINARY_DIR@/multi_fc_model"} %size = dt.tensor_map_get_size(%map) -> i32 - Infrt.print.i32 %size + infrt.print.i32 %size %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2], values=[0, 0] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir index 5623aef71aa2c33ff0bd3524855c56e9dcab5e9b..09210078b9d7d139f2bc2534acf07e83aa1146bb 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir index e580634055a72eae66196f67c8321c308599a1af..5847d567cf6b42a9404d33a938a67c6dc2f4aefc 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir @@ -6,5 +6,5 @@ func @test_tensor_type() { // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_shape.mlir 
b/paddle/infrt/tests/dialect/tensor_shape.mlir index 5623aef71aa2c33ff0bd3524855c56e9dcab5e9b..09210078b9d7d139f2bc2534acf07e83aa1146bb 100644 --- a/paddle/infrt/tests/dialect/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor_type.mlir index e580634055a72eae66196f67c8321c308599a1af..5847d567cf6b42a9404d33a938a67c6dc2f4aefc 100644 --- a/paddle/infrt/tests/dialect/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor_type.mlir @@ -6,5 +6,5 @@ func @test_tensor_type() { // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir index 6d25044d139f32c0a29adefb44c8fd2640cadd82..e3cb9670bec015e58e2a538bb55dfbe7c8b7f554 100644 --- a/paddle/infrt/tests/dialect/trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -12,5 +12,5 @@ func @main(%bias:tensor, %c:tensor, %b1:tensor, %b2:tensor< %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - "infrt.return"(%e2) : (tensor)->() + infrt.return %e2 : tensor } diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 7601696293a66d626a7fd417b32544d035921467..88660449b6821ef4cda2d1859c5551d0a00d59a6 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -86,19 +86,28 @@ class PADDLE_API CustomOpKernelContext { CustomOpKernelContext() = default; void EmplaceBackInput(Tensor&& input); - void EmplaceBackInputs(std::vector&& inputs); + void EmplaceBackInputs(const std::vector& inputs); void EmplaceBackOutput(Tensor&& output); - void EmplaceBackOutputs(std::vector&& outputs); + void EmplaceBackOutputs(const std::vector& outputs); void EmplaceBackAttr(paddle::any attr); - + void EmplaceBackAttrs(const std::vector& attrs) { + attrs_ = std::move(attrs); + } const std::pair& InputRangeAt(size_t idx) const; const std::pair& OutputRangeAt(size_t idx) const; const Tensor& InputAt(size_t idx) const; std::vector InputsBetween(size_t start, size_t end) const; - + const std::vector& Attrs() const { return attrs_; } + const std::vector>& InputRange() { + return input_range_; + } + const std::vector>& OutputRange() { + return output_range_; + } Tensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetweeen(size_t start, size_t end); + std::vector OutputsBetweeen(size_t start, size_t end); std::vector* AllMutableOutput(); template @@ -552,7 +561,6 @@ class PADDLE_API OpMetaInfo { std::vector inputs_; std::vector outputs_; std::vector attrs_; - // 2. 
func info KernelFunc kernel_fn_{nullptr}; InferShapeFunc infer_shape_fn_{nullptr}; diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 51d51c954de81d8e9116304e31374ce8d9934305..14dba664c41b3d7b138630c739bfe7b934d04e9f 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -51,7 +51,8 @@ void CustomOpKernelContext::EmplaceBackInput(Tensor&& input) { input_range_.emplace_back(std::make_pair(index, index + 1)); } -void CustomOpKernelContext::EmplaceBackInputs(std::vector&& inputs) { +void CustomOpKernelContext::EmplaceBackInputs( + const std::vector& inputs) { size_t index = inputs_.size(); input_range_.emplace_back(std::make_pair(index, index + inputs.size())); inputs_.insert(inputs_.end(), @@ -65,7 +66,8 @@ void CustomOpKernelContext::EmplaceBackOutput(Tensor&& output) { output_range_.emplace_back(std::make_pair(index, index + 1)); } -void CustomOpKernelContext::EmplaceBackOutputs(std::vector&& outputs) { +void CustomOpKernelContext::EmplaceBackOutputs( + const std::vector& outputs) { size_t index = outputs_.size(); output_range_.emplace_back(std::make_pair(index, index + outputs.size())); outputs_.insert(outputs_.end(), @@ -75,6 +77,8 @@ void CustomOpKernelContext::EmplaceBackOutputs(std::vector&& outputs) { void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(std::move(attr)); + VLOG(7) << "attrs_ No." << attrs_.size() - 1 + << " has value of type: " << attrs_[attrs_.size() - 1].type().name(); } const Tensor& CustomOpKernelContext::InputAt(size_t idx) const { @@ -102,6 +106,15 @@ std::vector CustomOpKernelContext::MutableOutputBetweeen(size_t start, return rlt; } +std::vector CustomOpKernelContext::OutputsBetweeen(size_t start, + size_t end) { + std::vector rlt; + for (size_t i = start; i < end; ++i) { + rlt.emplace_back(outputs_.at(i)); + } + return rlt; +} + std::vector* CustomOpKernelContext::AllMutableOutput() { return &outputs_; } diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 311dd0fc30941d2afb9f1bc1e7ae57f3a449a254..40174a505dcc9b3d475254b7cec7691300c7aecf 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -111,8 +111,8 @@ void Tensor::reshape(const std::vector &shape) { "touching underlying data, this requires the total size of " "the tensor to remain constant."; if (is_dense_tensor()) { - std::dynamic_pointer_cast(impl_)->set_meta( - phi::DenseTensorMeta(dtype(), phi::make_ddim(shape))); + std::dynamic_pointer_cast(impl_)->Resize( + phi::make_ddim(shape)); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index aefa26952d1e5f224112576bfbd74be80cca72cc..885e29b27fa8e723ad0e89f9a99c2accd3c172f6 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 1c9f7c3a8683daaf26cb87b23e50284d0329c4a8..3d183ea7fee8b17a7037a3fd9a6b2999605d8e25 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -40,6 +40,13 @@ phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); + PADDLE_ENFORCE_EQ( + tensor.numel(), + 1UL, + platform::errors::InvalidArgument("The DenseTensor used to construct " + "the Scalar contains more than 1 " + "value, it contains `%d` values.", + tensor.numel())); if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index bbd4966b7274f88ad4fad47dfdf7ce8e50ae2a3a..6315fe15afdf1ecd9c7657396468320eda7d88c1 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -523,6 +523,15 @@ struct CustomRuntimeParams { char reserved[32]; }; +#define PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params) \ + if ((params)->size != sizeof(DevicePluginParams) && \ + (params)->interface->size != sizeof(C_DeviceInterface)) { \ + return; \ + } \ + (params)->version.major = PADDLE_DEVICE_PLUGIN_MAJOR_VERSION; \ + (params)->version.minor = PADDLE_DEVICE_PLUGIN_MINOR_VERSION; \ + (params)->version.patch = PADDLE_DEVICE_PLUGIN_PATCH_VERSION; + // Plugin implement it and fill CustomRuntimeParams void InitPlugin(CustomRuntimeParams*); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 09deb575f2414a7a101c7f02d040ca1f4bd1a7f8..a3b252598582bc212ba66f9c18ec52e035a29a68 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -654,10 +654,17 @@ struct GPUContext::Impl { } void AddStreamCallback(const std::function& callback) const { - // TODO(wilber): Do we need ThreadPool? - auto* func = new std::function([this, callback] { + // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may + // launch too + // many threads and result in thread oversubscription. 
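+  // The callback is copied onto the heap so the asynchronous task below owns
+  // it outright; std::launch::async makes the callback run eagerly rather
+  // than being deferred until the future is waited on, and the unique_ptr
+  // releaser inside the task frees the heap copy once the callback returns,
+  // so nothing can dangle on the caller's argument.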
+ auto* callback_func = new std::function(std::move(callback)); + auto* func = new std::function([this, callback_func] { std::lock_guard lock(stream_call_back_mtx_); - last_future_ = std::async(std::launch::deferred, [&]() { callback(); }); + VLOG(4) << "Stream callback"; + last_future_ = std::async(std::launch::async, [callback_func]() { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); }); #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 00e9bff9bd5910ceedcca3dfb3a7a64ec88596df..7f4384545f353ecdbd33c73751e186061bf316cc 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -55,6 +55,7 @@ const std::unordered_set deprecated_op_names({"diag", "expand_grad", "expand_as_grad", "sum", + "one_hot", "sum_grad", "top_k", "top_k_grad"}); diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index bc317da8d98ed4eb8abf8250f03c364b17c178b1..48778bb38e5487506f4b402176fff26cbe485de7 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -33,6 +33,10 @@ void CustomKernelMap::RegisterCustomKernel(const std::string& name, void CustomKernelMap::RegisterCustomKernels() { VLOG(3) << "Size of custom_kernel_map: " << kernels_.size(); + if (kernels_.size() <= 0) { + LOG(INFO) << "No custom kernel info found in loaded lib(s)."; + return; + } auto& kernels = KernelFactory::Instance().kernels(); for (auto& pair : kernels_) { PADDLE_ENFORCE_NE( @@ -60,9 +64,10 @@ void CustomKernelMap::RegisterCustomKernels() { << info_pair.first << "] to Paddle. It will be used like native ones."; } - kernels_[pair.first].clear(); } - LOG(INFO) << "Successed in loading custom kernels."; + LOG(INFO) << "Successed in loading " << kernels_.size() + << " custom kernel(s) from loaded lib(s), will be " + << "used like native ones."; kernels_.clear(); } diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index eb114304f53ea08b05d36792330cf5bd3ebbee5d..bcbb1a4835b9d0397f6e85b7c44311bb9fe57209 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -72,6 +72,10 @@ void MetaTensor::set_layout(DataLayout layout) { } void MetaTensor::share_lod(const MetaTensor& meta_tensor) { + if (meta_tensor.lod().size() == 0) { + // no need share + return; + } if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_))->lod = meta_tensor.lod(); @@ -110,7 +114,7 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } -TensorBase* MetaTensor::get_tensor() const { return tensor_; } +TensorBase* MetaTensor::tensor() const { return tensor_; } void MetaTensor::share_dims(const MetaTensor& meta_tensor) { bool is_dense_tensor = phi::DenseTensor::classof(tensor_); @@ -118,7 +122,7 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) { if (is_dense_tensor || is_selected_rows) { set_dims(meta_tensor.dims()); if (is_selected_rows) { - const auto in_tensor_base = meta_tensor.get_tensor(); + const auto in_tensor_base = meta_tensor.tensor(); PADDLE_ENFORCE_EQ( phi::SelectedRows::classof(in_tensor_base), true, diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 3971a9f7e99e0282cae5e4d1e61ee6eb28c4b9a7..10c3a7c1a3de376d21805a12ff0b2c98ab4fcbd3 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -26,11 +26,13 @@ namespace phi { // TODO(chenweihang): add other flags if needed struct MetaConfig { bool is_runtime{true}; - + bool 
is_run_mkldnn_kernel{false}; MetaConfig() = default; // supporting implicit construction is easier to use - MetaConfig(bool is_runtime) : is_runtime(is_runtime) {} // NOLINT + MetaConfig(bool is_runtime, bool is_run_mkldnn_kernel) + : is_runtime(is_runtime), + is_run_mkldnn_kernel(is_run_mkldnn_kernel) {} // NOLINT }; class MetaTensor { @@ -66,7 +68,7 @@ class MetaTensor { // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; - TensorBase* get_tensor() const; + TensorBase* tensor() const; TensorBase* tensor_; }; diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 0a2b4dcae58ca07b054e04a5a5f8e7a720591034..37d1a234b5767a3873bda6b41e6e410df1c452af 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -64,6 +64,16 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, } } +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad) { + const auto& dtype = out_grad.dtype(); + x_grad->set_dims(x.dims()); + x_grad->share_lod(x); + x_grad->set_dtype(dtype); +} + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -93,6 +103,12 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x, } } +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx) { + if (dx) { + dx->share_meta(x); + } +} + void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, @@ -102,17 +118,37 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dout.dims(), errors::InvalidArgument( "Input(Out) and its gradients should have the same shape.")); + dx->share_meta(dout); } -void GatherNdGradInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& out_grad, - MetaTensor* x_grad) { - const auto& dtype = out_grad.dtype(); - x_grad->set_dims(x.dims()); - x_grad->share_lod(x); - x_grad->set_dtype(dtype); +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx) { + dx->share_meta(x); +} + +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx) { + dx->share_meta(x); } void PsroiPoolGradInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c4003ca1fe76b865079e8f577fdee9db3be895ab..06ee5a205d7b0f2f842e1b9b4b8fad8948168b64 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -30,6 +30,11 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, MetaTensor* dweight, MetaTensor* dbias); +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -42,11 +47,23 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x, MetaTensor* dy, MetaTensor* dz); +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); + void 
GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, MetaTensor* dx); +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx); + void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, @@ -57,6 +74,21 @@ void PsroiPoolGradInferMeta(const MetaTensor& x, float spatial_scale, MetaTensor* dx); +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx); + void ScatterGradInferMeta(const MetaTensor& index, const MetaTensor& updates, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 641956c4d9de796bed166e1f6238ff6988601bec..2947661517e78f328100d9b208cfa45c5f98dae1 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -21,6 +21,203 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { +namespace detail { + +static void BinarySameInputDimsCheck(const MetaTensor& x, + const MetaTensor& y, + MetaConfig config) { + auto input_dim = x.dims(); + auto other_dim = y.dims(); + PADDLE_ENFORCE_EQ(input_dim.size(), + other_dim.size(), + phi::errors::PreconditionNotMet( + "Input(Input) and Input(Other) must have the same " + "dimension size.")); + int n = input_dim.size(); + bool is_runtime = config.is_runtime; + for (int i = 0; i < n; i++) { + if (is_runtime) { + PADDLE_ENFORCE_EQ(input_dim[i], + other_dim[i], + phi::errors::PreconditionNotMet( + "The value at dim %d of Input(Input) is not " + "equal to the Input(Other): %ld != %ld.", + i, + input_dim[i], + other_dim[i])); + } else { + if (!(input_dim[i] < 0 || other_dim[i] < 0)) { + PADDLE_ENFORCE_EQ(input_dim[i], + other_dim[i], + phi::errors::PreconditionNotMet( + "The value at dim %d of Input(Input) is not " + "equal to the Input(Other): %ld != %ld.", + i, + input_dim[i], + other_dim[i])); + } + } + } +} + +} // namespace detail + +void AllValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + detail::BinarySameInputDimsCheck(x, y, config); + + out->set_dims(phi::make_ddim({1})); + out->set_dtype(DataType::BOOL); +} + +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + out->share_meta(x); +} + +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + int rank = input_dims.size(); + PADDLE_ENFORCE_EQ(rank, + label_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." 
+ "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + label_dims.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same " + "shape. But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + input_dims, + label_dims)); + } + + out->set_dims(input_dims); + out->set_dtype(input.dtype()); + out->share_lod(input); +} + +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out) { + auto input_dim = x.dims(); + + PADDLE_ENFORCE_GE(minlength, + 0, + phi::errors::InvalidArgument( + "The minlength should be greater than or equal to 0." + "But received minlength is %d", + minlength)); + + PADDLE_ENFORCE_EQ( + input_dim.size(), + 1, + phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." + "But the dimension of Input(X) is [%d]", + input_dim.size())); + + if (weights.is_initialized()) { + auto weights_dim = weights->dims(); + PADDLE_ENFORCE_EQ(weights_dim.size(), + 1, + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be 1-D tensor." + "But the dimension of Input(Weights) is [%d]", + weights_dim.size())); + + PADDLE_ENFORCE_EQ( + weights_dim[0], + input_dim[0], + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be equal to the 'shape' of " + "Input(X)." + "But received: the 'shape' of Input(Weights) is [%s]," + "the 'shape' of Input(X) is [%s]", + weights_dim, + input_dim)); + } + out->set_dims(phi::make_ddim({-1})); + if (weights.is_initialized()) { + out->set_dtype(weights->dtype()); + } else { + out->set_dtype(x.dtype()); + } + + out->share_lod(x); +} + +void CholeskySolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto x_dims_n = x_dims.size(); + auto y_dims_n = y_dims.size(); + + PADDLE_ENFORCE_GE(x_dims_n, + 2, + phi::errors::InvalidArgument( + "the rank of input Y must greater or equal to 2")); + PADDLE_ENFORCE_GE(y_dims_n, + 2, + phi::errors::InvalidArgument( + "the rank of input X must greater or equal to 2")); + PADDLE_ENFORCE_EQ( + y_dims[y_dims_n - 1], + y_dims[y_dims_n - 2], + phi::errors::InvalidArgument("input Matrix Y should be square matrix," + "But Got last shape of %ld x %ld", + y_dims[y_dims_n - 1], + y_dims[y_dims_n - 2])); + PADDLE_ENFORCE_EQ( + x_dims[x_dims_n - 2], + y_dims[y_dims_n - 2], + phi::errors::InvalidArgument("the first dim of Matrix X must be equal to " + "the fisrt dim of Matrix Y," + "But Got %ld and %ld", + x_dims[x_dims_n - 2], + y_dims[y_dims_n - 2])); + + std::vector x_dims_vec = phi::vectorize(x_dims); + std::vector y_dims_vec = phi::vectorize(y_dims); + + std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); + std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); + + std::vector expand_batch_portion = + funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + + std::vector x_broadcast_dims({expand_batch_portion}); + x_broadcast_dims.insert(x_broadcast_dims.end(), + {x_dims_vec[x_dims_n - 2], x_dims_vec[x_dims_n - 1]}); + + // dim of 'out' is the same with 'X' after broadcast + out->set_dims(phi::make_ddim(x_broadcast_dims)); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + 
out->share_lod(x); +} void CompareInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -67,6 +264,74 @@ void CompareAllInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto dim = axis; + + bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); + PADDLE_ENFORCE_EQ( + dims_match, + true, + phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " + "the 'shape' of Input(Y). But received " + "Input(X).dimensions = [%s], " + "Input(Y).dimensions = [%s]", + x_dim, + y_dim)); + + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < x_dim.size() && dim >= (0 - x_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + x_dim.size(), + x_dim.size() - 1, + dim)); + if (dim < 0) { + dim += x_dim.size(); + } + PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, + true, + phi::errors::InvalidArgument( + "Input(X/Y).dims()[dim] should be equal to 3." + "But received Input(X/Y).dims()[dim] = %d.", + x_dim[dim])); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + +void DistInferMeta(const MetaTensor& x, + const MetaTensor& y, + float p, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + PADDLE_ENFORCE_NE(phi::product(x_dims), + 0, + phi::errors::InvalidArgument( + "The Input(X) has not been initialized properly. The " + "shape of Input(X) = [%s].", + x_dims)); + PADDLE_ENFORCE_NE(phi::product(y_dims), + 0, + phi::errors::InvalidArgument( + "The Input(Y) has not been initialized properly. The " + "shape of Input(Y) = [%s].", + y_dims)); + out->set_dims({1}); + out->set_dtype(x.dtype()); +} + void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_dims = x.dims(); auto x_rank = static_cast(x_dims.size()); @@ -109,84 +374,11 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->set_layout(x.layout()); } -void MatmulInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out) { - std::vector dims_x = phi::vectorize(x.dims()); - std::vector dims_y = phi::vectorize(y.dims()); - auto ndims_x = dims_x.size(); - auto ndims_y = dims_y.size(); - PADDLE_ENFORCE_GT(ndims_x, - 0UL, - phi::errors::InvalidArgument( - "The Input(x) dims size must be greater than 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_GT(ndims_y, - 0UL, - phi::errors::InvalidArgument( - "The Input(y) dims size must be greater than 0," - " but reviced dims size is 0. 
")); - - bool x_broadcasted = false, y_broadcasted = false; - if (ndims_x == 1) { - dims_x.insert(dims_x.begin(), 1); - ndims_x = 2; - x_broadcasted = true; - } - - if (ndims_y == 1) { - dims_y.push_back(1); - ndims_y = 2; - y_broadcasted = true; - } - - size_t M, N; - if (trans_x) { - M = dims_x[ndims_x - 1]; - } else { - M = dims_x[ndims_x - 2]; - } - if (trans_y) { - N = dims_y[ndims_y - 2]; - } else { - N = dims_y[ndims_y - 1]; - } - - std::vector new_dims; - if (ndims_x > ndims_y) { - new_dims.assign(dims_x.begin(), dims_x.end() - 2); - } else if (ndims_x < ndims_y) { - new_dims.assign(dims_y.begin(), dims_y.end() - 2); - } else { - new_dims.reserve(ndims_x); - for (size_t i = 0; i < ndims_x - 2; ++i) { - new_dims.push_back(std::max(dims_x[i], dims_y[i])); - } - } - if (!x_broadcasted) { - new_dims.push_back(M); - } - if (!y_broadcasted) { - new_dims.push_back(N); - } - if (x_broadcasted && y_broadcasted) { - new_dims.push_back(1); - } - - auto ddim_out = phi::make_ddim(new_dims); - - out->set_dims(ddim_out); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); -} - -void ElementwiseInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out) { - return ElementwiseRawInferMeta(x, y, -1, std::move(out)); -} +void ElementwiseInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + return ElementwiseRawInferMeta(x, y, -1, std::move(out)); +} void ElementwiseRawInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -223,383 +415,19 @@ void ElementwiseRawInferMeta(const MetaTensor& x, funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - auto out_dims = phi::make_ddim(out_dims_array); - out->set_dims(out_dims); - } else { - out->set_dims(x.dims()); - } - - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); -} - -void HuberLossInferMeta(const MetaTensor& input, - const MetaTensor& label, - float delta, - MetaTensor* out, - MetaTensor* residual, - MetaConfig config) { - auto input_dims = input.dims(); - auto label_dims = label.dims(); - - PADDLE_ENFORCE_EQ(input_dims.size(), - label_dims.size(), - phi::errors::InvalidArgument( - "Input(input) rank and Input(label) rank should be " - "same, but received input rank(%d) != label rank(%d)", - input_dims.size(), - label_dims.size())); - - bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || - phi::contain_unknown_dim(label_dims); - if (config.is_runtime || !contain_unknown_dim) { - PADDLE_ENFORCE_EQ( - input_dims, - label_dims, - phi::errors::InvalidArgument( - "The Input(input) and Input(label) should have the same " - "shape, but received input shape [%s] != label shape [%s]", - input_dims, - label_dims)); - } - - auto out_dims = label_dims; - residual->set_dims(out_dims); - out->set_dims(out_dims); - out->share_lod(input); -} - -void CholeskySolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE(x_dims_n, - 2, - phi::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(y_dims_n, - 2, - phi::errors::InvalidArgument( - "the rank of input X must greater or equal to 2")); - PADDLE_ENFORCE_EQ( - y_dims[y_dims_n - 1], - y_dims[y_dims_n - 2], - phi::errors::InvalidArgument("input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - 
y_dims[y_dims_n - 1], - y_dims[y_dims_n - 2])); - PADDLE_ENFORCE_EQ( - x_dims[x_dims_n - 2], - y_dims[y_dims_n - 2], - phi::errors::InvalidArgument("the first dim of Matrix X must be equal to " - "the fisrt dim of Matrix Y," - "But Got %ld and %ld", - x_dims[x_dims_n - 2], - y_dims[y_dims_n - 2])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector x_broadcast_dims({expand_batch_portion}); - x_broadcast_dims.insert(x_broadcast_dims.end(), - {x_dims_vec[x_dims_n - 2], x_dims_vec[x_dims_n - 1]}); - - // dim of 'out' is the same with 'X' after broadcast - out->set_dims(phi::make_ddim(x_broadcast_dims)); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); -} - -void TriangularSolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - bool transpose, - bool unitriangular, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE(x_dims_n, - 2, - phi::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), - x_dims)); - - PADDLE_ENFORCE_GE(y_dims_n, - 2, - phi::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), - y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], - x_dims[x_dims_n - 1], - phi::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], - x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), - {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); - - // dim of 'out' is the same with 'Y' after broadcast - out->set_dims(phi::make_ddim(y_broadcast_dims)); - out->set_dtype(y.dtype()); - out->set_layout(y.layout()); - out->share_lod(y); -} - -void IndexSampleInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out, - MetaConfig config) { - auto input_dims = x.dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 2, - errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = y.dims(); - PADDLE_ENFORCE_EQ( - index_dims.size(), - 2, - errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (config.is_runtime) { - PADDLE_ENFORCE_EQ(input_dims[0], - index_dims[0], - errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got 
%d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], - index_dims[0])); - } - out->set_dtype(x.dtype()); - out->set_dims(index_dims); - out->share_lod(y); -} -void CrossInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out) { - auto x_dim = x.dims(); - auto y_dim = y.dims(); - auto dim = axis; - - bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); - PADDLE_ENFORCE_EQ( - dims_match, - true, - phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - x_dim, - y_dim)); - - if (dim != DDim::kMaxRank) { - PADDLE_ENFORCE_EQ( - dim < x_dim.size() && dim >= (0 - x_dim.size()), - true, - phi::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - x_dim.size(), - x_dim.size() - 1, - dim)); - if (dim < 0) { - dim += x_dim.size(); - } - PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, - true, - phi::errors::InvalidArgument( - "Input(X/Y).dims()[dim] should be equal to 3." - "But received Input(X/Y).dims()[dim] = %d.", - x_dim[dim])); - } - out->set_dims(x_dim); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); -} - -void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - out->share_meta(x); -} - -void SegmentPoolInferMeta(const MetaTensor& x, - const MetaTensor& segment_ids, - const std::string& pooltype, - MetaTensor* out, - MetaTensor* summed_ids, - MetaConfig config) { - auto dims = x.dims(); - dims[0] = -1; - out->set_dims(dims); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - - if (pooltype == "MEAN") { - summed_ids->set_dims({-1, 1}); - summed_ids->set_dtype(x.dtype()); - summed_ids->set_layout(x.layout()); - } -} - -void BCELossInferMeta(const MetaTensor& input, - const MetaTensor& label, - MetaTensor* out, - MetaConfig config) { - auto input_dims = input.dims(); - auto label_dims = label.dims(); - - int rank = input_dims.size(); - PADDLE_ENFORCE_EQ(rank, - label_dims.size(), - phi::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, - label_dims.size())); - - bool check = true; - if ((!config.is_runtime) && - (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ(input_dims, - label_dims, - phi::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same " - "shape. But received: the shape of Input(X) is " - "[%s], the shape of Input(Label) is [%s].", - input_dims, - label_dims)); - } - - out->set_dims(input_dims); - out->set_dtype(input.dtype()); - out->share_lod(input); -} - -void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, - int minlength, - MetaTensor* out) { - auto input_dim = x.dims(); - - PADDLE_ENFORCE_GE(minlength, - 0, - phi::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ( - input_dim.size(), - 1, - phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." 
- "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (weights.is_initialized()) { - auto weights_dim = weights->dims(); - PADDLE_ENFORCE_EQ(weights_dim.size(), - 1, - phi::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], - input_dim[0], - phi::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." - "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, - input_dim)); - } - out->set_dims(phi::make_ddim({-1})); - if (weights.is_initialized()) { - out->set_dtype(weights->dtype()); + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + auto out_dims = phi::make_ddim(out_dims_array); + out->set_dims(out_dims); } else { - out->set_dtype(x.dtype()); + out->set_dims(x.dims()); } - out->share_lod(x); -} - -void DistInferMeta(const MetaTensor& x, - const MetaTensor& y, - float p, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - PADDLE_ENFORCE_NE(phi::product(x_dims), - 0, - phi::errors::InvalidArgument( - "The Input(X) has not been initialized properly. The " - "shape of Input(X) = [%s].", - x_dims)); - PADDLE_ENFORCE_NE(phi::product(y_dims), - 0, - phi::errors::InvalidArgument( - "The Input(Y) has not been initialized properly. The " - "shape of Input(Y) = [%s].", - y_dims)); - out->set_dims({1}); out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); } void GatherNdInferMeta(const MetaTensor& x, @@ -648,6 +476,78 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void HuberLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + PADDLE_ENFORCE_EQ(input_dims.size(), + label_dims.size(), + phi::errors::InvalidArgument( + "Input(input) rank and Input(label) rank should be " + "same, but received input rank(%d) != label rank(%d)", + input_dims.size(), + label_dims.size())); + + bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || + phi::contain_unknown_dim(label_dims); + if (config.is_runtime || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ( + input_dims, + label_dims, + phi::errors::InvalidArgument( + "The Input(input) and Input(label) should have the same " + "shape, but received input shape [%s] != label shape [%s]", + input_dims, + label_dims)); + } + + auto out_dims = label_dims; + residual->set_dims(out_dims); + out->set_dims(out_dims); + out->share_lod(input); +} + +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(X) shape of IndexSample op should be 2-D, but " + "got X's shape = [%s], please check X shape.", + input_dims)); + + auto index_dims = y.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(Index) shape of IndexSample op should be 2-D, but " + "got Index's shape [%s] , please check index shape.", + input_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(input_dims[0], + index_dims[0], + errors::InvalidArgument( + "Inputs(X)'s value of dimension 0 must same with " + "Inputs(Index)'s value of dimension 0, but " + "got %d of Inputs(X), and got %d of 
Inputs(Index), " + "please check Inputs shape.", + input_dims[0], + index_dims[0])); + } + out->set_dtype(x.dtype()); + out->set_dims(index_dims); + out->share_lod(y); +} + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, @@ -690,6 +590,79 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out) { + std::vector dims_x = phi::vectorize(x.dims()); + std::vector dims_y = phi::vectorize(y.dims()); + auto ndims_x = dims_x.size(); + auto ndims_y = dims_y.size(); + PADDLE_ENFORCE_GT(ndims_x, + 0UL, + phi::errors::InvalidArgument( + "The Input(x) dims size must be greater than 0," + " but reviced dims size is 0. ")); + PADDLE_ENFORCE_GT(ndims_y, + 0UL, + phi::errors::InvalidArgument( + "The Input(y) dims size must be greater than 0," + " but reviced dims size is 0. ")); + + bool x_broadcasted = false, y_broadcasted = false; + if (ndims_x == 1) { + dims_x.insert(dims_x.begin(), 1); + ndims_x = 2; + x_broadcasted = true; + } + + if (ndims_y == 1) { + dims_y.push_back(1); + ndims_y = 2; + y_broadcasted = true; + } + + size_t M, N; + if (trans_x) { + M = dims_x[ndims_x - 1]; + } else { + M = dims_x[ndims_x - 2]; + } + if (trans_y) { + N = dims_y[ndims_y - 2]; + } else { + N = dims_y[ndims_y - 1]; + } + + std::vector new_dims; + if (ndims_x > ndims_y) { + new_dims.assign(dims_x.begin(), dims_x.end() - 2); + } else if (ndims_x < ndims_y) { + new_dims.assign(dims_y.begin(), dims_y.end() - 2); + } else { + new_dims.reserve(ndims_x); + for (size_t i = 0; i < ndims_x - 2; ++i) { + new_dims.push_back(std::max(dims_x[i], dims_y[i])); + } + } + if (!x_broadcasted) { + new_dims.push_back(M); + } + if (!y_broadcasted) { + new_dims.push_back(N); + } + if (x_broadcasted && y_broadcasted) { + new_dims.push_back(1); + } + + auto ddim_out = phi::make_ddim(new_dims); + + out->set_dims(ddim_out); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); +} + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { auto dim_x = x.dims(); auto dim_vec = vec.dims(); @@ -720,6 +693,25 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void SegmentPoolInferMeta(const MetaTensor& x, + const MetaTensor& segment_ids, + const std::string& pooltype, + MetaTensor* out, + MetaTensor* summed_ids, + MetaConfig config) { + auto dims = x.dims(); + dims[0] = -1; + out->set_dims(dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + + if (pooltype == "MEAN") { + summed_ids->set_dims({-1, 1}); + summed_ids->set_dtype(x.dtype()); + summed_ids->set_layout(x.layout()); + } +} + void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, const MetaTensor& label, bool normalize, @@ -761,4 +753,63 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, out->share_lod(x); } +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto x_dims_n = x_dims.size(); + auto y_dims_n = y_dims.size(); + + PADDLE_ENFORCE_GE(x_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor X's dimensions of TriangularSolveOp " + "should be >= 2. 
But received X's " + "dimensions = %d, X's shape = [%s]", + x_dims.size(), + x_dims)); + + PADDLE_ENFORCE_GE(y_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor Y's dimensions of TriangularSolveOp " + "should be >=2. But received Y's " + "dimensions = %d, Y's shape = [%s]", + y_dims.size(), + y_dims)); + + PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1])); + + std::vector x_dims_vec = phi::vectorize(x_dims); + std::vector y_dims_vec = phi::vectorize(y_dims); + + std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); + std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); + + std::vector expand_batch_portion = + funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + + std::vector y_broadcast_dims({expand_batch_portion}); + y_broadcast_dims.insert(y_broadcast_dims.end(), + {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); + + // dim of 'out' is the same with 'Y' after broadcast + out->set_dims(phi::make_ddim(y_broadcast_dims)); + out->set_dtype(y.dtype()); + out->set_layout(y.layout()); + out->share_lod(y); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d2b16e557b06dc94107788995f0c26f1e27e1761..cfae45cf04b87c287a174d172700a794c8c2a2a3 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -29,22 +29,48 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +void AllValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); + +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out); + +void CholeskySolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + MetaTensor* out); + +void CompareAllInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); + void CompareInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, MetaTensor* out); -void CompareAllInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out); +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out); -void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void DistInferMeta(const MetaTensor& x, + const MetaTensor& y, + float p, + MetaTensor* out); -void MatmulInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out); +void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void ElementwiseInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -55,6 +81,14 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void GatherNdInferMeta(const MetaTensor& x, + const MetaTensor& index, + MetaTensor* out); + +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out); + void HuberLossInferMeta(const MetaTensor& input_meta, const MetaTensor& label_meta, float delta, @@ -62,29 +96,24 @@ void 
HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); -void CholeskySolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - MetaTensor* out); - -void TriangularSolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - bool transpose, - bool unitriangular, - MetaTensor* out); - void IndexSampleInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, MetaConfig config = MetaConfig()); -void CrossInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out); +void LogLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float epsilon, + MetaTensor* out, + MetaConfig config = MetaConfig()); -void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out); + +void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& segment_ids, @@ -93,37 +122,6 @@ void SegmentPoolInferMeta(const MetaTensor& x, MetaTensor* summed_ids, MetaConfig config = MetaConfig()); -void BCELossInferMeta(const MetaTensor& input, - const MetaTensor& label, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, - int minlength, - MetaTensor* out); - -void DistInferMeta(const MetaTensor& x, - const MetaTensor& y, - float p, - MetaTensor* out); - -void GatherNdInferMeta(const MetaTensor& x, - const MetaTensor& index, - MetaTensor* out); - -void GatherTreeMeta(const MetaTensor& ids, - const MetaTensor& parents, - MetaTensor* out); - -void LogLossInferMeta(const MetaTensor& input, - const MetaTensor& label, - float epsilon, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); - void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, const MetaTensor& label, bool normalize, @@ -131,4 +129,11 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 506d3fd14ea3fd568ce2f77d7ce30408062279e9..081084567e840f287bb113ee567888f4032f5638 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -16,6 +16,12 @@ limitations under the License. 
*/ namespace phi { +void CreateInferMeta(const ScalarArray& shape, + DataType dtype, + MetaTensor* out) { + CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); +} + void CreateInferMetaBase(const std::vector& shape, DataType dtype, DataLayout layout, @@ -26,12 +32,6 @@ void CreateInferMetaBase(const std::vector& shape, out->set_layout(layout); } -void CreateInferMeta(const ScalarArray& shape, - DataType dtype, - MetaTensor* out) { - CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); -} - void EyeInferMeta(int64_t num_rows, int64_t num_columns, DataType dtype, @@ -41,18 +41,6 @@ void EyeInferMeta(int64_t num_rows, out->set_dtype(dtype); } -void TruncatedGaussianRandomInferMeta(const std::vector& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out) { - auto out_dims = phi::make_ddim(shape); - out->set_dims(out_dims); - out->set_dtype(dtype); - out->set_layout(DataLayout::NCHW); -} - void GaussianRandomInferMeta(const ScalarArray& shape, float mean, float std, @@ -65,4 +53,16 @@ void GaussianRandomInferMeta(const ScalarArray& shape, out->set_layout(DataLayout::NCHW); } +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + } // namespace phi diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index bd0567486e4d62a9f6fe9adfa02727bfe79937e1..38eaa636f8c8779c5a1f597b8cfb23ce6efc5edc 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -28,25 +28,18 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); + void CreateInferMetaBase(const std::vector& shape, DataType dtype, DataLayout layout, MetaTensor* out); -void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); - void EyeInferMeta(int64_t num_rows, int64_t num_columns, DataType dtype, MetaTensor* out); -void TruncatedGaussianRandomInferMeta(const std::vector& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out); - void GaussianRandomInferMeta(const ScalarArray& shape, float mean, float std, @@ -54,4 +47,11 @@ void GaussianRandomInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 88ac2cb0f8d1b01ade0e58bc8f1253c67ad05981..235cfe368c1921eac546b670470963fb49100290 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -18,6 +18,58 @@ limitations under the License. */ namespace phi { +void AccuracyInferMeta(const MetaTensor& out, + const MetaTensor& indice, + const MetaTensor& label, + MetaTensor* accuracy, + MetaTensor* correct, + MetaTensor* total, + MetaConfig config) { + auto inference_dim = out.dims(); + auto label_dim = label.dims(); + // Assume indices has same shape as inference, because + // it's the output of topk. + PADDLE_ENFORCE_EQ( + label_dim.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: label's dimensions of AccuracyOp must be 2. 
" + "But received label's dimensions = %d, label's shape = [%s]", + label_dim.size(), + label_dim)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(label_dim[1], + 1, + phi::errors::InvalidArgument( + "ShapeError: label's second dimension of " + "AccuracyOp must be 1. But received label's " + "second dimension is = %d, label's shape = [%s]", + label_dim[1], + label_dim)); + PADDLE_ENFORCE_EQ( + inference_dim[0], + label_dim[0], + phi::errors::InvalidArgument( + "ShapeError: the output's num_rows of AccuracyOp must be" + " the same as label's num_rows. But received output's " + "shape = [%s], label's shape = [%s], output's num_rows = %d, " + "label's " + "num_rows = %d", + inference_dim, + label_dim, + inference_dim[0], + label_dim[0])); + } + + accuracy->set_dims({1}); + accuracy->set_dtype(out.dtype()); + correct->set_dims({1}); + correct->set_dtype(out.dtype()); + total->set_dims({1}); + total->set_dtype(out.dtype()); + accuracy->share_lod(out); +} + void AddmmInferMeta(const MetaTensor& input, const MetaTensor& x, const MetaTensor& y, @@ -89,6 +141,107 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void GraphSendRecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& pool_type, + MetaTensor* out, + MetaTensor* dst_count) { + auto src_index_dims = src_index.dims(); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Src_index should be 1 when it " + "is 2D, but we get %d", + src_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + src_index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); + } + + auto dst_index_dims = dst_index.dims(); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Dst_index should be 1 when it " + "is 2D, but we get %d", + dst_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dst_index_dims.size(), + 1, + phi::errors::InvalidArgument("The Dst_index should be 1D, " + "when it is not 2D, but we get %d", + dst_index_dims.size())); + } + + PADDLE_ENFORCE_EQ(src_index_dims[0], + dst_index_dims[0], + phi::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + + auto dims = x.dims(); + out->set_dims(dims); + out->set_dtype(x.dtype()); + + if (pool_type == "MEAN") { + dst_count->set_dims({dims[0]}); + dst_count->set_dtype(DataType::INT32); + } +} + +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto w_dims = weight.dims(); + DDim out_dims; + out_dims = funcs::GetOutputDims(x_dims, y_dims); + if (w_dims.size() > 1 || w_dims[0] != 1) { + out_dims = funcs::GetOutputDims(out_dims, w_dims); + } + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out) { + auto s_dims = start.dims(); + PADDLE_ENFORCE_EQ( + (s_dims.size() == 1) && (s_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," + "but received input shape is [%s].", + s_dims)); + auto e_dims = stop.dims(); + PADDLE_ENFORCE_EQ( + (e_dims.size() == 1) && (e_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," 
+ "but received input shape is [%s].", + e_dims)); + auto step_dims = number.dims(); + PADDLE_ENFORCE_EQ( + (step_dims.size() == 1) && (step_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," + "but received input shape is [%s].", + step_dims)); + out->set_dims(phi::make_ddim({-1})); + out->set_dtype(start.dtype()); +} + void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, paddle::optional weight, @@ -319,156 +472,4 @@ void ViterbiDecodeInferMeta(const MetaTensor& input, scores->set_dtype(length.dtype()); } -void LerpInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - auto w_dims = weight.dims(); - DDim out_dims; - out_dims = funcs::GetOutputDims(x_dims, y_dims); - if (w_dims.size() > 1 || w_dims[0] != 1) { - out_dims = funcs::GetOutputDims(out_dims, w_dims); - } - out->set_dims(out_dims); - out->set_dtype(x.dtype()); - out->share_lod(x); -} - -void LinspaceInferMeta(const MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out) { - auto s_dims = start.dims(); - PADDLE_ENFORCE_EQ( - (s_dims.size() == 1) && (s_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = stop.dims(); - PADDLE_ENFORCE_EQ( - (e_dims.size() == 1) && (e_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = number.dims(); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - out->set_dims(phi::make_ddim({-1})); - out->set_dtype(start.dtype()); -} - -void AccuracyInferMeta(const MetaTensor& out, - const MetaTensor& indice, - const MetaTensor& label, - MetaTensor* accuracy, - MetaTensor* correct, - MetaTensor* total, - MetaConfig config) { - auto inference_dim = out.dims(); - auto label_dim = label.dims(); - // Assume indices has same shape as inference, because - // it's the output of topk. - PADDLE_ENFORCE_EQ( - label_dim.size(), - 2, - phi::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), - label_dim)); - if (config.is_runtime) { - PADDLE_ENFORCE_EQ(label_dim[1], - 1, - phi::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], - label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], - label_dim[0], - phi::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. 
But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, - label_dim, - inference_dim[0], - label_dim[0])); - } - - accuracy->set_dims({1}); - accuracy->set_dtype(out.dtype()); - correct->set_dims({1}); - correct->set_dtype(out.dtype()); - total->set_dims({1}); - total->set_dtype(out.dtype()); - accuracy->share_lod(out); -} - -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& pool_type, - MetaTensor* out, - MetaTensor* dst_count) { - auto src_index_dims = src_index.dims(); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), - 1, - phi::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = dst_index.dims(); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), - 1, - phi::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ(src_index_dims[0], - dst_index_dims[0], - phi::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = x.dims(); - out->set_dims(dims); - out->set_dtype(x.dtype()); - - if (pool_type == "MEAN") { - dst_count->set_dims({dims[0]}); - dst_count->set_dtype(DataType::INT32); - } -} } // namespace phi diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index c9a7e78db752f95c7e38857e3f1075a0d672246b..209a07db18b5c7a87ba094c5839149533757220d 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -45,16 +45,22 @@ void AddmmInferMeta(const MetaTensor& input, float beta, MetaTensor* out); -void GatherNdGradInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& out_grad, - MetaTensor* x_grad); +void GraphSendRecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& pool_type, + MetaTensor* out, + MetaTensor* dst_count); -void ScatterInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& updates, - bool overwrite, - MetaTensor* out); +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out); + +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out); void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, @@ -65,6 +71,12 @@ void NllLossRawInferMeta(const MetaTensor& input, MetaTensor* total_weight, MetaConfig config = MetaConfig()); +void ScatterInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& updates, + bool overwrite, + MetaTensor* out); + void ScatterNdAddInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, @@ -78,20 +90,4 @@ void ViterbiDecodeInferMeta(const MetaTensor& input, MetaTensor* path, MetaConfig config = MetaConfig()); -void LerpInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - 
MetaTensor* out); - -void LinspaceInferMeta(const MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out); - -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& pool_type, - MetaTensor* out, - MetaTensor* dst_count); } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 9daad7d6aaa9f5af70b4b7c3b4bfa96bc351194b..4d1cb42bd59f072e5926b237528a742c231bcdcf 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -22,10 +22,87 @@ limitations under the License. */ #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" namespace phi { +void ArgMinMaxInferMeta(const MetaTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + int dtype, + MetaTensor* out, + MetaConfig config) { + const auto& x_dims = x.dims(); + + PADDLE_ENFORCE_GE( + axis, + -x_dims.size(), + phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" + " -Rank(X)(%d).", + axis, + -x_dims.size())); + PADDLE_ENFORCE_LT(axis, + x_dims.size(), + phi::errors::InvalidArgument( + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", + axis, + x_dims.size())); + + PADDLE_ENFORCE_EQ( + (dtype < 0 || dtype == 2 || dtype == 3), + true, + phi::errors::InvalidArgument( + "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " + "received [%s]", + paddle::framework::DataTypeToString( + paddle::framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + paddle::framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + static_cast(dtype)))); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + if (config.is_runtime) { + if (dtype == paddle::framework::proto::VarType::INT32) { + int64_t all_element_num = 0; + if (flatten) { + all_element_num = phi::product(x_dims); + + } else { + all_element_num = x_dims[axis]; + } + PADDLE_ENFORCE_LE( + all_element_num, + INT_MAX, + phi::errors::InvalidArgument( + "The element num of the argmin/argmax input at axis is " + "%d, is larger than int32 maximum value:%d, you must " + "set the dtype of argmin/argmax to 'int64'.", + all_element_num, + INT_MAX)); + } + } + std::vector vec; + if (flatten) { + vec.emplace_back(static_cast(1)); + } else { + for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); + if (keepdims) { + vec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); + } + out->set_dims(phi::make_ddim(vec)); + if (dtype == 2) { + out->set_dtype(DataType::INT32); + } else if (dtype == 3) { + out->set_dtype(DataType::INT64); + } +} + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, @@ -54,96 +131,6 @@ void ArgsortInferMeta(const MetaTensor& input, indices->share_lod(input); } -void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { - out->share_meta(x); -} - -// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] -void UnchangedInferMetaCheckAxis(const MetaTensor& x, - int axis, - MetaTensor* out) { - auto rank = x.dims().size(); - PADDLE_ENFORCE_GE( - axis, - -rank, - errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X). 
But received axis: %d, R: %d.", - axis, - rank)); - PADDLE_ENFORCE_LT( - axis, - rank, - phi::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X). But received axis: %d, R: %d.", - axis, - rank)); - out->share_meta(x); -} - -void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(x.dims()); - out->set_dtype(dtype::ToReal(x.dtype())); - out->set_layout(x.layout()); -} - -void FlattenInferMeta(const MetaTensor& x, - int start_axis, - int stop_axis, - MetaTensor* out) { - auto x_dims = x.dims(); - int in_dims_size = x_dims.size(); - if (start_axis < 0) { - start_axis = start_axis + in_dims_size; - } - if (stop_axis < 0) { - stop_axis = stop_axis + in_dims_size; - } - PADDLE_ENFORCE_GE( - stop_axis, - start_axis, - phi::errors::InvalidArgument("The stop_axis should be greater" - "than or equal to start_axis.")); - - int64_t outer = 1; - std::vector out_shape; - out_shape.reserve(in_dims_size - stop_axis + start_axis); - - for (int i = 0; i < start_axis; ++i) { - out_shape.push_back(x_dims[i]); - } - for (int i = start_axis; i <= stop_axis; i++) { - if (x_dims[i] == -1 || outer == -1) { - outer = -1; - } else { - outer *= x_dims[i]; - } - } - out_shape.push_back(outer); - for (int i = stop_axis + 1; i < in_dims_size; i++) { - out_shape.push_back(x_dims[i]); - } - const auto& out_dims = phi::make_ddim(out_shape); - out->set_dims(out_dims); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - out->share_lod(x); - } -} - -void GumbelSoftmaxInferMeta(const MetaTensor& x, - float temperature, - bool hard, - int axis, - MetaTensor* out) { - UnchangedInferMetaCheckAxis(x, axis, out); -} - void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(out_dtype); @@ -203,73 +190,275 @@ void CumsumInferMeta(const MetaTensor& x, out->share_lod(x); } -void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { - PADDLE_ENFORCE_EQ( - product(x.dims()), - 1UL, - errors::InvalidArgument("The number of elements in Input(X) should be 1." - "Now the number is %d.", - product(x.dims()))); - out->set_dims(x.dims()); - out->share_lod(x); - out->set_dtype(x.dtype()); -} - -static phi::DDim ValidateShape(const std::vector shape, - const phi::DDim& in_dims) { - const int64_t in_size = phi::product(in_dims); - auto in_dims_vec = phi::vectorize(in_dims); - bool all_positive = std::all_of(in_dims_vec.cbegin(), - in_dims_vec.cend(), - [](int64_t i) { return i > 0; }); - // only one dimension can be set to -1, whose size will be automatically - // infered. - const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out) { + auto x_dims = x.dims(); - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_ENFORCE_EQ( - unk_dim_idx, - -1, - phi::errors::InvalidArgument( - "Only one dimension value of 'shape' in ReshapeOp can " - "be -1. 
But received shape = [%s], shape[%d] is also -1.", - phi::make_ddim(shape), - i)); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE_LT( - static_cast(i), - in_dims.size(), - phi::errors::InvalidArgument( - "The index of 0 in `shape` must be less than " - "the input tensor X's dimensions. " - "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " - "X's dimensions = %d.", - phi::make_ddim(shape), - i, - in_dims, - in_dims.size())); + if (x_dims.size() == 1UL) { + int64_t size_ = x_dims[0] + std::abs(offset); + out->set_dims({size_, size_}); + out->set_dtype(x.dtype()); + } else if (x_dims.size() == 2UL) { + int64_t size_ = 0; + if (offset >= 0) { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] < x_dims[1] - offset) { + size_ = x_dims[0]; + } else { + size_ = x_dims[1] - offset; + } } else { - PADDLE_ENFORCE_GT( - shape[i], - 0, - phi::errors::InvalidArgument( - "Each dimension value of 'shape' in ReshapeOp must not " - "be negative except one unknown dimension. " - "But received shape = [%s], shape[%d] = %d.", - phi::make_ddim(shape), - i, - shape[i])); + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] + offset < x_dims[1]) { + size_ = x_dims[0] + offset; + } else { + size_ = x_dims[1]; + } } - - // NOTE all non-zero values will be converted to True (include negative - // value) - capacity *= (shape[i] ? shape[i] : in_dims[i]); + out->set_dims({size_}); + out->set_dtype(x.dtype()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The input tensor X's dimensions of DiagV2Op should be either 1 or " + "2, but received %d.", + x_dims.size())); + } +} + +void DiagonalInferMeta(const MetaTensor& input, + int offset, + int axis1, + int axis2, + MetaTensor* out) { + auto x_dims = input.dims(); + int offset_ = offset; + int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; + int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2; + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::OutOfRange("Input's dim is out of range (expected at " + "least 2 dimensions, but got %ld).", + x_dims.size())); + PADDLE_ENFORCE_LT( + axis1_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis1) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis1)); + PADDLE_ENFORCE_LT( + axis2_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis2) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis2)); + PADDLE_ENFORCE_NE( + axis1_, + axis2_, + phi::errors::InvalidArgument("The dimensions should not be identical " + "%d vs %d.", + axis1, + axis2)); + + auto out_dims = vectorize(x_dims); + // from out_dims get the dim size of axis1_. + auto axis1_size = out_dims[axis1_]; + auto axis2_size = out_dims[axis2_]; + // delete two dims by attr axis1 and axis2 from out_dims. + /* example: + out_dim = [2, 3, 4]; + axis1 = 0; + axis2 = 1; + according to the attr of axis1 and axis2, we get: + out_dim = [4]. 
+ */ + out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); + out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + + if (offset_ == 0) { + out_dims.push_back(std::min(axis1_size, axis2_size)); + } else if (offset_ > 0) { + if ((axis2_size - offset_) > 0) { + out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); + } else { + out_dims.push_back(0); + } + } else { + if ((axis1_size + offset_) > 0) { + out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); + } else { + out_dims.push_back(0); + } + } + out->set_dims(phi::make_ddim(out_dims)); +} + +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v) { + auto input_dim = x.dims(); + auto rank = input_dim.size(); + + PADDLE_ENFORCE_GE(rank, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions." + "But received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + input_dim[rank - 2], + input_dim[rank - 1], + phi::errors::InvalidArgument( + "Eigh op is designed for square matrix, consequently" + "inner-most 2 dimensions of Input(X) should be symmetric." + "But received X's shape[-2] = %d and shape[-1] = %d.", + input_dim[rank - 2], + input_dim[rank - 1])); + + std::vector values_dim; + + for (auto i = 0; i < rank - 1; i++) { + values_dim.emplace_back(input_dim[i]); + } + out_w->set_dims(phi::make_ddim(values_dim)); + out_v->set_dims(input_dim); +} + +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out) { + auto x_dims = x.dims(); + int in_dims_size = x_dims.size(); + if (start_axis < 0) { + start_axis = start_axis + in_dims_size; + } + if (stop_axis < 0) { + stop_axis = stop_axis + in_dims_size; + } + PADDLE_ENFORCE_GE( + stop_axis, + start_axis, + phi::errors::InvalidArgument("The stop_axis should be greater" + "than or equal to start_axis.")); + + int64_t outer = 1; + std::vector out_shape; + out_shape.reserve(in_dims_size - stop_axis + start_axis); + + for (int i = 0; i < start_axis; ++i) { + out_shape.push_back(x_dims[i]); + } + for (int i = start_axis; i <= stop_axis; i++) { + if (x_dims[i] == -1 || outer == -1) { + outer = -1; + } else { + outer *= x_dims[i]; + } + } + out_shape.push_back(outer); + for (int i = stop_axis + 1; i < in_dims_size; i++) { + out_shape.push_back(x_dims[i]); + } + const auto& out_dims = phi::make_ddim(out_shape); + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + out->share_lod(x); + } +} + +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out) { + UnchangedInferMetaCheckAxis(x, axis, out); +} + +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { + PADDLE_ENFORCE_EQ( + product(x.dims()), + 1UL, + errors::InvalidArgument("The number of elements in Input(X) should be 1." + "Now the number is %d.", + product(x.dims()))); + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +static phi::DDim ValidateShape(const std::vector shape, + const phi::DDim& in_dims) { + const int64_t in_size = phi::product(in_dims); + auto in_dims_vec = phi::vectorize(in_dims); + bool all_positive = std::all_of(in_dims_vec.cbegin(), + in_dims_vec.cend(), + [](int64_t i) { return i > 0; }); + // only one dimension can be set to -1, whose size will be automatically + // infered. 
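+ // Illustrative example (hypothetical values): with in_dims = [2, 3, 4]
+ // (so in_size = 24) and shape = {0, -1}, the 0 copies in_dims[0] (= 2)
+ // and the single -1 entry is inferred as 24 / 2 = 12, giving a validated
+ // shape of [2, 12].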
+ const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE_EQ( + unk_dim_idx, + -1, + phi::errors::InvalidArgument( + "Only one dimension value of 'shape' in ReshapeOp can " + "be -1. But received shape = [%s], shape[%d] is also -1.", + phi::make_ddim(shape), + i)); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE_LT( + static_cast(i), + in_dims.size(), + phi::errors::InvalidArgument( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " + "X's dimensions = %d.", + phi::make_ddim(shape), + i, + in_dims, + in_dims.size())); + } else { + PADDLE_ENFORCE_GT( + shape[i], + 0, + phi::errors::InvalidArgument( + "Each dimension value of 'shape' in ReshapeOp must not " + "be negative except one unknown dimension. " + "But received shape = [%s], shape[%d] = %d.", + phi::make_ddim(shape), + i, + shape[i])); + } + + // NOTE all non-zero values will be converted to True (include negative + // value) + capacity *= (shape[i] ? shape[i] : in_dims[i]); output_shape[i] = (shape[i] ? static_cast(shape[i]) : in_dims[i]); } @@ -360,22 +549,99 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } -void MultinomialInferMeta(const MetaTensor& x, - int num_samples, - bool replacement, - MetaTensor* out) { - auto x_dim = x.dims(); - int64_t x_rank = x_dim.size(); - PADDLE_ENFORCE_GT(x_rank, - 0, - errors::InvalidArgument( - "The number of dimensions of the input probability " - "distribution should be > 0, but got %d.", - x_rank)); - PADDLE_ENFORCE_LE(x_rank, - 2, - errors::InvalidArgument( - "The number of dimensions of the input probability " +void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + auto x_dims = x.dims(); + + PADDLE_ENFORCE( + x_dims.size() == 4 || x_dims.size() == 5, + errors::InvalidArgument( + "Pooling intput should be 4-D or 5-D tensor but received %dD-Tensor", + x_dims.size())); + + if (global_pooling) { + kernel_size_.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x_dims[i + 2]); + } + } + + PADDLE_ENFORCE_EQ( + x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "The input size %d minus the kernel size %d should equal to 2.", + x_dims.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "Strides size %d and pooling size %d should be the same.", + strides.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + paddings_.size(), + errors::InvalidArgument( + "Paddings size %d and pooling size %d should be the same.", + paddings_.size(), + kernel_size_.size())); + + std::vector output_shape({x_dims[0], x_dims[1]}); + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } 
else { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + if ((!config.is_runtime) && (x_dims[i + 2] < 0)) { + output_shape.push_back(x_dims[i + 2]); + } else { + output_shape.push_back(funcs::MaxPoolOutputSize( + x_dims[i + 2], kernel_size_[i], paddings_[i], strides[i])); + } + } + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(x.dtype()); + + mask->set_dims(make_ddim(output_shape)); + mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); +} + +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + bool replacement, + MetaTensor* out) { + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + PADDLE_ENFORCE_GT(x_rank, + 0, + errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be > 0, but got %d.", + x_rank)); + PADDLE_ENFORCE_LE(x_rank, + 2, + errors::InvalidArgument( + "The number of dimensions of the input probability " "distribution should be <= 2, but got %d.", x_rank)); @@ -395,124 +661,209 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } -void TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, +void PadInferMeta(const MetaTensor& input, + const std::vector& paddings, + float pad_value, + MetaTensor* out, + MetaConfig config) { + auto x_dim = input.dims(); + PADDLE_ENFORCE_EQ( + static_cast(paddings.size()), + x_dim.size() * 2, + phi::errors::InvalidArgument( + "Size of 'paddings' dimension should be equal to 2 * size of " + "Input(X)'s dimension, but received (size of 'paddings' dimension " + "is) %d vs (2 * size of Input(X)'s dimension is) %d.", + static_cast(paddings.size()), + x_dim.size() * 2)); + for (size_t i = 0; i < paddings.size(); ++i) { + PADDLE_ENFORCE_GE(paddings[i], + 0, + phi::errors::InvalidArgument( + "The element of 'paddings' should >= 0, but " + "received %d for index %d.", + paddings[i], + static_cast(i))); + } + std::vector out_dims(x_dim.size()); + for (int i = 0; i < x_dim.size(); ++i) { + if ((!config.is_runtime) && (x_dim[i] == -1)) { + out_dims[i] = -1; + } else { + out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; + } + } + out->set_dims(phi::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. 
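+ // Illustrative example (hypothetical values): x_dim = [3, 4] with
+ // paddings = {0, 0, 2, 2} yields out_dims = [3, 8]; the first dimension
+ // is unchanged, so the input's LoD can be shared with the output.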
+ out->share_lod(input); + } + out->set_dtype(input.dtype()); +} + +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + input_dims.size())); + + const bool channel_last = (data_format == "NHWC"); + + if (!channel_last) { + PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[1])); + } else { + PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[3])); + } + auto output_dims = input_dims; + output_dims[0] = input_dims[0]; + if (!channel_last) { + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + } else { + output_dims[1] = input_dims[1] * upscale_factor; + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); + } + out->set_dtype(x.dtype()); + out->set_dims(output_dims); +} + +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, MetaTensor* out, MetaConfig config) { -#define MAX_RANK_SUPPORTED 6 + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; - auto repeat_times_data = repeat_times.GetData(); auto x_dims = x.dims(); - if (repeat_times_data.size() == 0) { - repeat_times_data = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), - MAX_RANK_SUPPORTED, - errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times_data.size(), - MAX_RANK_SUPPORTED, - errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - repeat_times_data.size())); - PADDLE_ENFORCE_GE( - repeat_times_data.size(), - 1, + PADDLE_ENFORCE_EQ( + x_dims.size() == 4 || x_dims.size() == 5, + true, errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times_data.size())); + "the input of Op(pool) should be 4-D or 5-D Tensor. 
But " + "received: %u-D Tensor and it's shape is [%s].", + x_dims.size(), + x_dims)); - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times_data.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times_data.size()) { - auto diff = x_dim_vec.size() - repeat_times_data.size(); - repeat_times_data.insert(repeat_times_data.begin(), diff, -1); + PADDLE_ENFORCE_EQ(x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "the dimension of input minus the size of " + "Attr(kernel_size_) must be euqal to 2 in Op(pool). " + "But received: the dimension of input minus the size " + "of Attr(kernel_size_) is %d, the " + "input's dimension is %d, the shape of input " + "is [%s], the Attr(kernel_size_)'s size is %d, the " + "Attr(kernel_size_) is [%s].", + x_dims.size() - kernel_size_.size(), + x_dims.size(), + x_dims, + kernel_size_.size(), + make_ddim(kernel_size_))); + + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "the size of Attr(kernel_size_) and Attr(strides) in " + "Op(pool) must be equal. " + "But received: Attr(kernel_size_)'s size is %d, Attr(strides)'s " + "size is %d, Attr(kernel_size_) is [%s], Attr(strides)is [%s].", + kernel_size_.size(), + strides.size(), + make_ddim(kernel_size_), + make_ddim(strides))); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings if "SAME" or global_pooling + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); } else { - auto diff = repeat_times_data.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); + data_dims = slice_ddim(x_dims, 2, x_dims.size()); } - for (size_t i = 0; i < repeat_times_data.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) { - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - repeat_times_data[i], - 0, - errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times_data[i])); - out_shape[i] = x_dim_vec[i] * repeat_times_data[i]; - } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); } - out->set_dims(phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - out->share_lod(x); + std::vector output_shape; + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } else { + for (int i = 0; i < data_dims.size(); ++i) { + if ((!config.is_runtime) && (data_dims[i] < 0)) { + output_shape.push_back(data_dims[i]); + } else { + output_shape.push_back(funcs::PoolOutputSize(data_dims[i], + kernel_size_[i], + paddings_[2 * i], + paddings_[2 * i + 1], + strides[i], + ceil_mode)); + } + } } -} -void ReshapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* out, - MetaConfig config) { - auto& shape_data = shape.GetData(); - PADDLE_ENFORCE_NOT_NULL(out, - phi::errors::InvalidArgument( - "Output(Out) of ReshapeOp should not be null.")); - if (!config.is_runtime && shape.FromTensor()) { - out->set_dims(phi::make_ddim(shape_data)); - out->share_lod(x); - return; + // output_N = input_N + 
output_shape.insert(output_shape.begin(), x_dims[0]); + // output_C = input_C + if (channel_last) { + output_shape.push_back(x_dims[x_dims.size() - 1]); + } else { + output_shape.insert(output_shape.begin() + 1, x_dims[1]); } - PADDLE_ENFORCE_GT(shape_data.size(), - 0, - phi::errors::InvalidArgument( - "The shape's size in ReshapeOp can't be zero.")); - InferMetaFromVecValue(x, shape_data, out); + + out->set_dims(make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); } -void ReshapeWithXShapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* xshape, - MetaTensor* out, - MetaConfig config) { - PADDLE_ENFORCE_NOT_NULL( - xshape, - phi::errors::InvalidArgument( - "Output(XShape) of ReshapeOp should not be null.")); - const auto& x_dims = x.dims(); - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; - } - xshape->set_dims(phi::make_ddim(xshape_dims)); - xshape->share_lod(x); - ReshapeInferMeta(x, shape, out, config); -} - -/* Why not use SumRawInferMeta directly? - Because we need make InferMetaFunction's args follow the design of api.yaml -*/ -void SumInferMeta(const MetaTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim, - MetaTensor* out) { - bool reduce_all = false; - SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(dtype::ToReal(x.dtype())); + out->set_layout(x.layout()); } DDim ReduceInferDim(const MetaTensor& x, @@ -584,29 +935,12 @@ DDim ReduceInferDim(const MetaTensor& x, return out_dim; } -void SumRawInferMeta(const MetaTensor& x, +void ReduceInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, - bool reduce_all, - DataType dtype, MetaTensor* out) { - DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); - - DataType out_dtype; - if (dtype != DataType::UNDEFINED) { - out_dtype = dtype; - } else { - if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || - x.dtype() == DataType::INT64) { - out_dtype = DataType::INT64; - } else { - out_dtype = x.dtype(); - } - } - - out->set_dims(out_dim); - out->set_dtype(out_dtype); - out->set_layout(x.layout()); + bool reduce_all = false; + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); } void ReduceInferMetaBase(const MetaTensor& x, @@ -620,20 +954,96 @@ void ReduceInferMetaBase(const MetaTensor& x, out->set_layout(x.layout()); } -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out) { - bool reduce_all = false; - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); +void ReshapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* out, + MetaConfig config) { + auto& shape_data = shape.GetData(); + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output(Out) of ReshapeOp should not be null.")); + if (!config.is_runtime && shape.FromTensor()) { + out->set_dims(phi::make_ddim(shape_data)); + out->share_lod(x); + return; + } + PADDLE_ENFORCE_GT(shape_data.size(), + 0, + phi::errors::InvalidArgument( + "The shape's size in ReshapeOp can't be zero.")); + InferMetaFromVecValue(x, shape_data, out); } -void TransferLayoutInferMeta(const MetaTensor& x, - DataLayout layout, - MetaTensor* out) { +void ReshapeWithXShapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* xshape, + MetaTensor* out, + MetaConfig config) { + 
PADDLE_ENFORCE_NOT_NULL( + xshape, + phi::errors::InvalidArgument( + "Output(XShape) of ReshapeOp should not be null.")); + const auto& x_dims = x.dims(); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + ReshapeInferMeta(x, shape, out, config); +} + +void ShardIndexInferMeta(const MetaTensor& in, + int index_num, + int nshards, + int shard_id, + int ignore_value, + MetaTensor* out, + MetaConfig config) { + auto x_dims = in.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 2, " + "but the value given is %d.", + x_dims.size())); + if (config.is_runtime || x_dims[x_dims.size() - 1] > 0) { + PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], + 1U, + phi::errors::InvalidArgument( + "The last dimension of Input(X) should be 1, " + "but the value given is %d.", + x_dims[x_dims.size() - 1])); + } + + out->set_dims(x_dims); + out->share_lod(in); + out->set_dtype(in.dtype()); +} + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { + out->set_dtype(DataType::INT64); + out->set_dims({1}); +} + +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out) { + auto dim_x = x.dims(); + auto rank_x = dim_x.size(); + PADDLE_ENFORCE_GE(axis, + -rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + PADDLE_ENFORCE_LT(axis, + rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + out->set_dims(x.dims()); out->set_dtype(x.dtype()); - out->set_layout(layout); + out->share_lod(x); } void SplitInferMeta(const MetaTensor& x, @@ -767,22 +1177,108 @@ void SplitInferMeta(const MetaTensor& x, } } -void UnbindInferMeta(const MetaTensor& x, - int axis, - std::vector* outs) { - auto in_dims = x.dims(); - std::vector out_dim; - axis = axis < 0 ? in_dims.size() + axis : axis; - for (int i = 0; i < in_dims.size(); ++i) { - if (i != axis) out_dim.push_back(in_dims[i]); +/* Why not use SumRawInferMeta directly? 
+ Because we need make InferMetaFunction's args follow the design of api.yaml +*/ +void SumInferMeta(const MetaTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim, + MetaTensor* out) { + bool reduce_all = false; + SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); +} + +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out) { + DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); + + DataType out_dtype; + if (dtype != DataType::UNDEFINED) { + out_dtype = dtype; + } else { + if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || + x.dtype() == DataType::INT64) { + out_dtype = DataType::INT64; + } else { + out_dtype = x.dtype(); + } } - auto out_dims = phi::make_ddim(out_dim); - for (size_t i = 0; i < outs->size(); ++i) { - (*outs)[i].set_dtype(x.dtype()); - (*outs)[i].set_dims(out_dims); - (*outs)[i].set_layout(x.layout()); - (*outs)[i].share_lod(x); + out->set_dims(out_dim); + out->set_dtype(out_dtype); + out->set_layout(x.layout()); +} + +void TileInferMeta(const MetaTensor& x, + const ScalarArray& repeat_times, + MetaTensor* out, + MetaConfig config) { +#define MAX_RANK_SUPPORTED 6 + + auto repeat_times_data = repeat_times.GetData(); + auto x_dims = x.dims(); + if (repeat_times_data.size() == 0) { + repeat_times_data = std::vector(x_dims.size(), -1); + } + + PADDLE_ENFORCE_LE( + x_dims.size(), + MAX_RANK_SUPPORTED, + errors::InvalidArgument( + "The rank of the input 'x' for tile op " + "must not be greater than %d, but the value received is %d.", + MAX_RANK_SUPPORTED, + x_dims.size())); + PADDLE_ENFORCE_LE( + repeat_times_data.size(), + MAX_RANK_SUPPORTED, + errors::InvalidArgument( + "The size of the shape of input 'repeat_times' for tile op " + "must not be greater than %d, but the value received is %d.", + MAX_RANK_SUPPORTED, + repeat_times_data.size())); + PADDLE_ENFORCE_GE( + repeat_times_data.size(), + 1, + errors::InvalidArgument( + "The size of the shape of input 'repeat_times' for tile op " + "must be positive integers, but the value received is %d.", + repeat_times_data.size())); + + auto out_rank = + std::max(static_cast(x_dims.size()), repeat_times_data.size()); + std::vector out_shape(out_rank); + auto x_dim_vec = phi::vectorize(x_dims); + if (x_dim_vec.size() > repeat_times_data.size()) { + auto diff = x_dim_vec.size() - repeat_times_data.size(); + repeat_times_data.insert(repeat_times_data.begin(), diff, -1); + } else { + auto diff = repeat_times_data.size() - x_dim_vec.size(); + x_dim_vec.insert(x_dim_vec.begin(), diff, -1); + } + for (size_t i = 0; i < repeat_times_data.size(); ++i) { + if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) { + out_shape[i] = -1; + } else { + PADDLE_ENFORCE_GT( + repeat_times_data[i], + 0, + errors::InvalidArgument( + "Every element of the input 'repeat_times' for tile op must be " + "greater than 0, but the value given is %d.", + repeat_times_data[i])); + out_shape[i] = x_dim_vec[i] * repeat_times_data[i]; + } + } + + out->set_dims(phi::make_ddim(out_shape)); + if (out_shape[0] == x_dims[0]) { + out->share_lod(x); } } @@ -840,79 +1336,112 @@ void TraceInferMeta( out->set_dtype(x.dtype()); } -void DiagonalInferMeta(const MetaTensor& input, - int offset, - int axis1, - int axis2, - MetaTensor* out) { - auto x_dims = input.dims(); - int offset_ = offset; - int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; - int axis2_ = axis2 < 0 ? 
x_dims.size() + axis2 : axis2; +void TransferLayoutInferMeta(const MetaTensor& x, + DataLayout layout, + MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->set_layout(layout); +} - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - phi::errors::OutOfRange("Input's dim is out of range (expected at " - "least 2 dimensions, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - axis1_, - x_dims.size(), - phi::errors::OutOfRange( - "Attr(axis1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), - (x_dims.size() - 1), - axis1)); - PADDLE_ENFORCE_LT( - axis2_, - x_dims.size(), - phi::errors::OutOfRange( - "Attr(axis2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), - (x_dims.size() - 1), - axis2)); - PADDLE_ENFORCE_NE( - axis1_, - axis2_, - phi::errors::InvalidArgument("The dimensions should not be identical " - "%d vs %d.", - axis1, - axis2)); +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out) { + auto x_dims = x.dims(); + size_t x_rank = x_dims.size(); + size_t axis_size = axis.size(); - auto out_dims = vectorize(x_dims); - // from out_dims get the dim size of axis1_. - auto axis1_size = out_dims[axis1_]; - auto axis2_size = out_dims[axis2_]; - // delete two dims by attr axis1 and axis2 from out_dims. - /* example: - out_dim = [2, 3, 4]; - axis1 = 0; - axis2 = 1; - according to the attr of axis1 and axis2, we get: - out_dim = [4]. - */ - out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); - out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + PADDLE_ENFORCE_EQ( + x_rank, + axis_size, + errors::InvalidArgument("The input tensor's dimension " + "should be equal to the axis's size. " + "But received input tensor's dimension is %d, " + "axis's size is %d", + x_rank, + axis_size)); - if (offset_ == 0) { - out_dims.push_back(std::min(axis1_size, axis2_size)); - } else if (offset_ > 0) { - if ((axis2_size - offset_) > 0) { - out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); - } else { - out_dims.push_back(0); - } - } else { - if ((axis1_size + offset_) > 0) { - out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); - } else { - out_dims.push_back(0); - } + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE_GE( + axis[i], + 0, + errors::InvalidArgument("The axis should be greater than or equal to 0." + "But received %d of axis[%d]", + axis[i], + i)); + + PADDLE_ENFORCE_EQ( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + true, + errors::InvalidArgument( + "Each element of Attribute axis should " + "be a unique value range from 0 to (dims - 1), " + "where the dims is the axis's size, " + "unique value means this axis value can appear only once. " + "But received axis[%d] is %d, axis_size is %d, " + "count[axis[%d]] is %d", + i, + axis[i], + axis_size, + i, + count[axis[i]])); } - out->set_dims(phi::make_ddim(out_dims)); + + phi::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; ++i) { + out_dims[i] = x_dims[axis[i]]; + } + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + +void UnbindInferMeta(const MetaTensor& x, + int axis, + std::vector* outs) { + auto in_dims = x.dims(); + std::vector out_dim; + axis = axis < 0 ? 
in_dims.size() + axis : axis; + for (int i = 0; i < in_dims.size(); ++i) { + if (i != axis) out_dim.push_back(in_dims[i]); + } + auto out_dims = phi::make_ddim(out_dim); + + for (size_t i = 0; i < outs->size(); ++i) { + (*outs)[i].set_dtype(x.dtype()); + (*outs)[i].set_dims(out_dims); + (*outs)[i].set_layout(x.layout()); + (*outs)[i].share_lod(x); + } +} + +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { + out->share_meta(x); +} + +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out) { + auto rank = x.dims().size(); + PADDLE_ENFORCE_GE( + axis, + -rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + PADDLE_ENFORCE_LT( + axis, + rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + out->share_meta(x); } void UnfoldInferMeta(const MetaTensor& x, @@ -1073,301 +1602,41 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } -void DiagInferMeta(const MetaTensor& x, - int offset, - float padding_value, - MetaTensor* out) { +void OneHotRawInferMeta(const MetaTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + MetaTensor* out) { auto x_dims = x.dims(); - - if (x_dims.size() == 1UL) { - int64_t size_ = x_dims[0] + std::abs(offset); - out->set_dims({size_, size_}); - out->set_dtype(x.dtype()); - } else if (x_dims.size() == 2UL) { - int64_t size_ = 0; - if (offset >= 0) { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] < x_dims[1] - offset) { - size_ = x_dims[0]; - } else { - size_ = x_dims[1] - offset; - } - } else { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] + offset < x_dims[1]) { - size_ = x_dims[0] + offset; - } else { - size_ = x_dims[1]; - } - } - out->set_dims({size_}); - out->set_dtype(x.dtype()); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "The input tensor X's dimensions of DiagV2Op should be either 1 or " - "2, but received %d.", - x_dims.size())); - } -} - -void ArgMinMaxInferMeta(const MetaTensor& x, - int64_t axis, - bool keepdims, - bool flatten, - int dtype, - MetaTensor* out, - MetaConfig config) { - const auto& x_dims = x.dims(); - PADDLE_ENFORCE_GE( - axis, - -x_dims.size(), - phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, - -x_dims.size())); - PADDLE_ENFORCE_LT(axis, - x_dims.size(), - phi::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", - axis, - x_dims.size())); - - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), - true, - phi::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (config.is_runtime) { - if (dtype == paddle::framework::proto::VarType::INT32) { - int64_t all_element_num 
= 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, - INT_MAX, - phi::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, - INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - out->set_dims(phi::make_ddim(vec)); - if (dtype == 2) { - out->set_dtype(DataType::INT32); - } else if (dtype == 3) { - out->set_dtype(DataType::INT64); - } -} - -void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { - out->set_dtype(DataType::INT64); - out->set_dims({1}); -} - -void PadInferMeta(const MetaTensor& input, - const std::vector& paddings, - float pad_value, - MetaTensor* out, - MetaConfig config) { - auto x_dim = input.dims(); - PADDLE_ENFORCE_EQ( - static_cast(paddings.size()), - x_dim.size() * 2, - phi::errors::InvalidArgument( - "Size of 'paddings' dimension should be equal to 2 * size of " - "Input(X)'s dimension, but received (size of 'paddings' dimension " - "is) %d vs (2 * size of Input(X)'s dimension is) %d.", - static_cast(paddings.size()), - x_dim.size() * 2)); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_GE(paddings[i], - 0, - phi::errors::InvalidArgument( - "The element of 'paddings' should >= 0, but " - "received %d for index %d.", - paddings[i], - static_cast(i))); - } - std::vector out_dims(x_dim.size()); - for (int i = 0; i < x_dim.size(); ++i) { - if ((!config.is_runtime) && (x_dim[i] == -1)) { - out_dims[i] = -1; - } else { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; - } - } - out->set_dims(phi::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. 
- out->share_lod(input); - } - out->set_dtype(input.dtype()); -} - -void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(x.dims()); - out->set_dtype(DataType::BOOL); -} - -void PixelShuffleInferMeta(const MetaTensor& x, - int upscale_factor, - const std::string& data_format, - MetaTensor* out) { - auto input_dims = x.dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 4, - phi::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W] " - "or [N, H, W, C], but got %u.", - input_dims.size())); - - const bool channel_last = (data_format == "NHWC"); + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); - if (!channel_last) { - PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), - 0, - phi::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, - input_dims[1])); - } else { - PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), - 0, - phi::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, - input_dims[3])); - } - auto output_dims = input_dims; - output_dims[0] = input_dims[0]; - if (!channel_last) { - output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] * upscale_factor; - } else { - output_dims[1] = input_dims[1] * upscale_factor; - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); - } - out->set_dtype(x.dtype()); - out->set_dims(output_dims); + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = phi::make_ddim(out_dims_vec); + out->set_dims(out_dims); + out->share_lod(x); + out->set_dtype(dtype); } -void TransposeInferMeta(const MetaTensor& x, - const std::vector& axis, - MetaTensor* out) { +void OneHotInferMeta(const MetaTensor& x, + const Scalar& depth_t, + MetaTensor* out) { auto x_dims = x.dims(); - size_t x_rank = x_dims.size(); - size_t axis_size = axis.size(); - - PADDLE_ENFORCE_EQ( - x_rank, - axis_size, - errors::InvalidArgument("The input tensor's dimension " - "should be equal to the axis's size. " - "But received input tensor's dimension is %d, " - "axis's size is %d", - x_rank, - axis_size)); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - PADDLE_ENFORCE_GE( - axis[i], - 0, - errors::InvalidArgument("The axis should be greater than or equal to 0." - "But received %d of axis[%d]", - axis[i], - i)); - - PADDLE_ENFORCE_EQ( - axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, - true, - errors::InvalidArgument( - "Each element of Attribute axis should " - "be a unique value range from 0 to (dims - 1), " - "where the dims is the axis's size, " - "unique value means this axis value can appear only once. 
" - "But received axis[%d] is %d, axis_size is %d, " - "count[axis[%d]] is %d", - i, - axis[i], - axis_size, - i, - count[axis[i]])); - } - - phi::DDim out_dims(x_dims); - for (size_t i = 0; i < axis_size; ++i) { - out_dims[i] = x_dims[axis[i]]; - } + PADDLE_ENFORCE_GE( + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); + int depth = depth_t.to(); + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = phi::make_ddim(out_dims_vec); out->set_dims(out_dims); - out->set_dtype(x.dtype()); -} - -void EighInferMeta(const MetaTensor& x, - const std::string& uplo, - MetaTensor* out_w, - MetaTensor* out_v) { - auto input_dim = x.dims(); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, - 2, - phi::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], - input_dim[rank - 1], - phi::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." - "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], - input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - out_w->set_dims(phi::make_ddim(values_dim)); - out_v->set_dims(input_dim); + out->share_lod(x); + out->set_dtype(phi::DataType::FLOAT32); } void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { @@ -1381,34 +1650,6 @@ void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { out->set_dtype(DataType::INT64); } -void ShardIndexInferMeta(const MetaTensor& in, - int index_num, - int nshards, - int shard_id, - int ignore_value, - MetaTensor* out, - MetaConfig config) { - auto x_dims = in.dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - phi::errors::InvalidArgument("Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (config.is_runtime || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], - 1U, - phi::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - out->set_dims(x_dims); - out->share_lod(in); - out->set_dtype(in.dtype()); -} - } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index e8be73e943e09c9794376945cc904fe6f2a3d324..75fb9fadf82dc87ac18814e0674e5012fea95ec4 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -32,32 +32,20 @@ class MetaConfig; // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. 
+void ArgMinMaxInferMeta(const MetaTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + int dtype, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, MetaTensor* output, MetaTensor* indices); -void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); - -// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] -void UnchangedInferMetaCheckAxis(const MetaTensor& x, - int axis, - MetaTensor* out); - -void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); - -void FlattenInferMeta(const MetaTensor& x, - int start_axis, - int stop_axis, - MetaTensor* out); - -void GumbelSoftmaxInferMeta(const MetaTensor& x, - float temperature, - bool hard, - int axis, - MetaTensor* out); - void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); @@ -76,6 +64,30 @@ void CumsumInferMeta(const MetaTensor& x, bool reverse, MetaTensor* out); +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out); + +void DiagonalInferMeta( + const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); + +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v); + +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out); + +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out); + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); void InferMetaFromVecValue(const MetaTensor& x, @@ -84,32 +96,53 @@ void InferMetaFromVecValue(const MetaTensor& x, void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); +void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); + +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config = MetaConfig()); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); -void ReshapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void PadInferMeta(const MetaTensor& input, + const std::vector& paddings, + float pad_value, + MetaTensor* out, + MetaConfig config = MetaConfig()); -void ReshapeWithXShapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* xshape, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out); -void TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, MetaTensor* out, MetaConfig config = MetaConfig()); -void SumRawInferMeta(const MetaTensor& x, +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); + +void ReduceInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, - bool reduce_all, - DataType dtype, MetaTensor* out); void 
ReduceInferMetaBase(const MetaTensor& x, @@ -118,10 +151,34 @@ void ReduceInferMetaBase(const MetaTensor& x, bool reduce_all, MetaTensor* out); -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out); +void ReshapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void ReshapeWithXShapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* xshape, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void ShardIndexInferMeta(const MetaTensor& in, + int index_num, + int nshards, + int shard_id, + int ignore_value, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out); + +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out); + +void SplitInferMeta(const MetaTensor& x_meta, + const ScalarArray& num_or_sections, + const Scalar& axis, + std::vector out, + MetaConfig config = MetaConfig()); void SumInferMeta(const MetaTensor& x, const std::vector& axis, @@ -129,21 +186,39 @@ void SumInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out); +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out); + +void TileInferMeta(const MetaTensor& x, + const ScalarArray& repeat_times, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void TraceInferMeta( + const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); + void TransferLayoutInferMeta(const MetaTensor& x, DataLayout layout, MetaTensor* out); -void SplitInferMeta(const MetaTensor& x_meta, - const ScalarArray& num_or_sections, - const Scalar& axis, - std::vector out, - MetaConfig config = MetaConfig()); +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out); void UnbindInferMeta(const MetaTensor& x, int axis, std::vector* outs); -void TraceInferMeta( - const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); + +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); + +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out); void UnfoldInferMeta(const MetaTensor& x, const std::vector& kernel_sizes, @@ -153,54 +228,14 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void DiagInferMeta(const MetaTensor& x, - int offset, - float padding_value, - MetaTensor* out); - -void ArgMinMaxInferMeta(const MetaTensor& x, - int64_t axis, - bool keepdims, - bool flatten, - int dtype, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void SizeInferMeta(const MetaTensor& input, MetaTensor* out); - -void PadInferMeta(const MetaTensor& input, - const std::vector& paddings, - float pad_value, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void DiagonalInferMeta( - const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); - -void PixelShuffleInferMeta(const MetaTensor& x, - int upscale_factor, - const std::string& data_format, - MetaTensor* out); - -void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); - -void TransposeInferMeta(const MetaTensor& x, - const std::vector& axis, +void OneHotRawInferMeta(const MetaTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, MetaTensor* out); -void EighInferMeta(const MetaTensor& x, - const std::string& uplo, - 
MetaTensor* out_w, - MetaTensor* out_v); +void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out); void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out); -void ShardIndexInferMeta(const MetaTensor& in, - int index_num, - int nshards, - int shard_id, - int ignore_value, - MetaTensor* out, - MetaConfig config = MetaConfig()); - } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 093cb6549797d198ccaaff533357243a51188a74..d443b7bb2a09225e78a7001374821114c59a1557 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -11,7 +11,7 @@ set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -27,22 +27,25 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel eigh_kernel segment_pool_kernel segment_pool_grad_kernel matrix_power_kernel matrix_power_grad_kernel) +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel) +kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) +kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) -kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) +kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling) kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) 
-kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) +kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index f34e5710ab7294425bacba4e5d5782859ac5f081..a5b737b28c23ba97988915f00cbf447d2e1b1c22 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -39,6 +39,54 @@ void ReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& ddx, DenseTensor* ddout); +template +void TanhDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void TanhTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +template +void BReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float t_min, + float t_max, + DenseTensor* dx); + +template +void LeakyReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float alpha, + DenseTensor* dx); + +template +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout); + +template +void ThresholdedReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float threshold, + DenseTensor* dx); + DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); @@ -51,5 +99,6 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index bdf8f4363598f8c25e6f128b3f38f13f23005828..885dccad8e377642b4cb9e36832ac4bd45f7915f 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -36,5 +36,25 @@ DECLARE_ACTIVATION_KERNEL(Asinh) DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) +DECLARE_ACTIVATION_KERNEL(Tanh) + +template +void BReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float t_min, + float t_max, + DenseTensor* 
out); + +template +void LeakyReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float alpha, + DenseTensor* out); + +template +void ThresholdedReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float threshold, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/allclose_kernel.h b/paddle/phi/kernels/allclose_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3f24078b86ca1736411cb929754441d737ee6028 --- /dev/null +++ b/paddle/phi/kernels/allclose_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index 4e72159aeca671614ccfe483ec1496f70e6b1d6a..cf83ab9aaabe135573a2887a01166f4a7bd0d5e1 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -40,7 +40,7 @@ DenseTensor Concat(const Context& dev_ctx, DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out); ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index fe43ebb816077432ca4e7f678be4591e5d31b6f7..f9af50f6832a1884f3ef58ccb5708b1f2636ccea 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -27,65 +27,135 @@ namespace phi { const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + 
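// Illustrative expansion (not part of this patch) of the one-attribute macro
// above, instantiated as in
//   DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu,
//                                                  funcs::LeakyReluGradFunctor,
//                                                  alpha);
// Angle-bracket template arguments do not survive in the surrounding text, so
// they are filled in here following the usual phi pattern and should be read
// as an assumption, not as the literal patch content.
template <typename T, typename Context>
void LeakyReluGradKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         const DenseTensor& dout,
                         float alpha,
                         DenseTensor* dx) {
  // The functor exposes its attributes as (name, float*) pairs; writing
  // through the pointer forwards the kernel argument into the functor.
  funcs::LeakyReluGradFunctor<T> functor;
  auto attrs = functor.GetAttrs();
  *(attrs[0].second) = alpha;
  ActivationGradImpl<T, Context, funcs::LeakyReluGradFunctor<T>>(
      dev_ctx, &x, nullptr, &dout, dx, functor);
}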
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, + funcs::LeakyReluGradFunctor, + alpha); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( + ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, + funcs::BReluGradFunctor, + t_min, + t_max); } // namespace phi -PD_REGISTER_KERNEL( - cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {} -PD_REGISTER_KERNEL( - tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acos_grad, CPU, ALL_LAYOUT, phi::AcosGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, float, double) {} -PD_REGISTER_KERNEL( - asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - 
cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acosh_grad, CPU, ALL_LAYOUT, phi::AcoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {} PD_REGISTER_KERNEL( relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {} -PD_REGISTER_KERNEL(relu_double_grad, + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} + +#define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) + +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, + ReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, + TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, + LeakyReluDoubleGradKernel) + +PD_REGISTER_KERNEL(tanh_triple_grad, CPU, ALL_LAYOUT, - phi::ReluDoubleGradKernel, + phi::TanhTripleGradKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 51883f25183af7c8013bbfb403404397c8492988..0d13429c8f651ccb40646fddd82a3529a95ab45d 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -27,6 +27,33 @@ namespace phi { ActivationImpl(dev_ctx, x, out, functor); \ } +#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationImpl>(dev_ctx, x, out, functor); \ + } + +#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationImpl>(dev_ctx, x, out, functor); \ + } + DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) @@ -39,17 +66,31 @@ DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) 
DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + funcs::ThresholdedReluFunctor, + threshold) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) } // namespace phi -PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {} -PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, double) {} -PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {} -PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {} -PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {} -PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {} -PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {} -PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {} -PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {} -PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {} -PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {} PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) +PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) +PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) +PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) +PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) +PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) +PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) +PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) +PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) +PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) +PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) +PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ffeadfeed8aaa60c13b651188c5099c949b98ab --- /dev/null +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
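// The AllCloseKernel defined below reduces an element-wise closeness test
// over the whole tensor into a single bool output. A minimal scalar sketch of
// that test follows; it is illustrative only and not part of this patch, and
// the helper name is_close is hypothetical.
#include <cmath>

inline bool is_close(double a, double b, double rtol, double atol,
                     bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    // Two NaNs compare equal only when equal_nan was requested.
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  // numpy-style tolerance: |a - b| <= atol + rtol * |b|.
  return a == b || std::fabs(a - b) <= atol + rtol * std::fabs(b);
}
// The kernel accumulates this predicate with &= across all elements, so a
// single mismatching pair makes the output false.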
+ +#include "paddle/phi/kernels/allclose_kernel.h" + +#include + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Rtol) type must be double, but get %s.", rtol.dtype())); + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Atol) type must be double, but get %s.", atol.dtype())); + + auto* in_a = x.data(); + auto* in_b = y.data(); + auto rtol_v = rtol.to(); + auto atol_v = atol.to(); + auto* out_data = dev_ctx.template Alloc(out); + *out_data = true; + + auto num = x.numel(); + for (int64_t i = 0; i < num; ++i) { + const T a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol_v + (b > 0 ? rtol_v * b : (-rtol_v) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + *out_data &= val; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + allclose, CPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); +} diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c56b225e2a753f963651f5e3f0a5cf711f5bb8a6 --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad) { + T* dx_data = dev_ctx.template Alloc(x_grad); + const T* dout_data = out_grad.data(); + auto dx_dims = x_grad->dims(); + auto dout_dims = out_grad.dims(); + + if (dx_dims.size() == 1) { + auto dx_length = dx_dims[0]; + int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); + dout_data += + (offset >= 0 ? 
offset * dout_stride_1 : -offset * dout_stride_0); + + for (int i = 0; i < dx_length; i++) { + dx_data[i * dx_stride] = dout_data[i * (dout_stride_0 + dout_stride_1)]; + } + } else { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, x_grad, static_cast(0)); + + int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims); + int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims); + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + dx_data += (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0); + + auto dout_length = dout_dims[0]; + for (int i = 0; i < dout_length; i++) { + dx_data[i * (dx_stride_0 + dx_stride_1)] = dout_data[i * dout_stride_0]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(diag_grad, + CPU, + ALL_LAYOUT, + phi::DiagGradKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc index d1e0b8e31e78fd74e6a15722546971a3cb72807a..4b060f0372a5bf50d9378239dae635e5723d0c7a 100644 --- a/paddle/phi/kernels/cpu/diag_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -62,5 +62,12 @@ void DiagKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - diag, CPU, ALL_LAYOUT, phi::DiagKernel, int, float, double, int64_t) {} +PD_REGISTER_KERNEL(diag, + CPU, + ALL_LAYOUT, + phi::DiagKernel, + phi::dtype::float16, + int, + float, + double, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index cd513e809fd84ace9b01b50aed537204b2be1684..bf6ec012b24443e877b235e17488725dc0d14151 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -259,3 +259,20 @@ PD_REGISTER_KERNEL(multiply_triple_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_REGISTER_KERNEL(elementwise_fmax_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..37ad18df56ec30c838dd5bd03c484d7889e976c0 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
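// The registrations below add elementwise_fmax / elementwise_fmin. Assuming
// the FMax / FMin functors follow std::fmax / std::fmin semantics (a NaN
// operand is ignored in favor of the other operand, unlike plain max/min),
// a scalar reference for the expected behaviour would be the sketch below;
// it is illustrative only and not part of this patch.
#include <cmath>

inline double fmax_ref(double a, double b) {
  if (std::isnan(a)) return b;  // prefer the non-NaN operand
  if (std::isnan(b)) return a;
  return a > b ? a : b;
}
// e.g. fmax_ref(1.0, NaN) == 1.0, while a plain max would propagate NaN;
//      fmax_ref(2.0, 3.0) == 3.0 as usual.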
+ +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +PD_REGISTER_KERNEL(elementwise_fmax, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMaxKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMinKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/kron_grad_kernel.cc b/paddle/phi/kernels/cpu/kron_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..01f5e5404b61d3ac96ffd6a811e449eae260c27d --- /dev/null +++ b/paddle/phi/kernels/cpu/kron_grad_kernel.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(kron_grad, + CPU, + ALL_LAYOUT, + phi::KronGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/kron_kernel.cc b/paddle/phi/kernels/cpu/kron_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..aaea509dc7641b8b6c44b031c44e2b210c0cde39 --- /dev/null +++ b/paddle/phi/kernels/cpu/kron_kernel.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +PD_REGISTER_KERNEL(kron, + CPU, + ALL_LAYOUT, + phi::KronKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e13abe8aed2caf205871a24cfddff0b8b959498 --- /dev/null +++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/matrix_rank_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + DenseTensor atol_tensor; + if (use_default_tol) { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(0)); + } else { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(tol)); + } + MatrixRankTolKernel( + dev_ctx, x, atol_tensor, use_default_tol, hermitian, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + matrix_rank, CPU, ALL_LAYOUT, phi::MatrixRankKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..70b6316e1044426a0743c8d5251ca7d210956356 --- /dev/null +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" + +namespace phi { + +template +void BatchEigenvalues(const T* x_data, + T* eigenvalues_data, + int batches, + int rows, + int cols, + int k) { + // Eigen::Matrix API need non-const pointer. + T* input = const_cast(x_data); + int stride = rows * cols; + for (int i = 0; i < batches; i++) { + auto m = Eigen::Map< + Eigen::Matrix>( + input + i * stride, rows, rows); + Eigen::SelfAdjointEigenSolver< + Eigen::Matrix> + eigen_solver(m); + auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); + for (int j = 0; j < k; j++) { + *(eigenvalues_data + i * k + j) = eigenvalues[j]; + } + } +} + +template +void BatchSVD(const T* x_data, + T* eigenvalues_data, + int batches, + int rows, + int cols, + int k) { + // Eigen::Matrix API need non-const pointer. 
+ T* input = const_cast(x_data); + int stride = rows * cols; + Eigen::BDCSVD< + Eigen::Matrix> + svd; + for (int i = 0; i < batches; i++) { + auto m = Eigen::Map< + Eigen::Matrix>( + input + i * stride, rows, cols); + svd.compute(m); + auto res_s = svd.singularValues(); + for (int j = 0; j < k; j++) { + eigenvalues_data[i * k + j] = res_s[j]; + } + } +} + +template +void MatrixRankTolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + auto* x_data = x.data(); + dev_ctx.template Alloc(out); + auto dim_x = x.dims(); + auto dim_out = out->dims(); + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + int k = std::min(rows, cols); + auto numel = x.numel(); + int batches = numel / (rows * cols); + + T rtol_T = 0; + + if (use_default_tol) { + rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); + } + + DenseTensor eigenvalue_tensor; + eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k)); + auto* eigenvalue_data = dev_ctx.template Alloc(&eigenvalue_tensor); + + if (hermitian) { + BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); + } else { + BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); + } + + DenseTensor max_eigenvalue_tensor; + max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims())); + dev_ctx.template Alloc(&max_eigenvalue_tensor); + phi::MaxKernel(dev_ctx, + eigenvalue_tensor, + std::vector{-1}, + false, + &max_eigenvalue_tensor); + + DenseTensor temp_rtol_tensor; + temp_rtol_tensor = + phi::Full(dev_ctx, {1}, static_cast(rtol_T)); + + DenseTensor rtol_tensor = + phi::Multiply(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor); + + DenseTensor tol_tensor; + tol_tensor.Resize(dim_out); + dev_ctx.template Alloc(&tol_tensor); + funcs::ElementwiseCompute, T, T>( + dev_ctx, + atol_tensor, + rtol_tensor, + -1, + GreaterElementFunctor(), + &tol_tensor); + + tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); + + DenseTensor compare_result; + compare_result.Resize(detail::NewAxisDim(dim_out, k)); + dev_ctx.template Alloc(&compare_result); + int axis = -1; + if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { + funcs::ElementwiseCompute, T, int>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::GreaterThanFunctor(), + &compare_result); + } else { + funcs::ElementwiseCompute, T, int>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::LessThanFunctor(), + &compare_result); + } + + phi::SumKernel(dev_ctx, + compare_result, + std::vector{-1}, + compare_result.dtype(), + false, + out); +} +} // namespace phi + +PD_REGISTER_KERNEL( + matrix_rank_tol, CPU, ALL_LAYOUT, phi::MatrixRankTolKernel, float, double) { +} diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/phi/kernels/cpu/one_hot_kernel.cc similarity index 50% rename from paddle/fluid/operators/one_hot_v2_op.h rename to paddle/phi/kernels/cpu/one_hot_kernel.cc index 9d42c5875bb6eecd1244ca1a0dd6442985ec2a02..dc58489ebf70eaaa7efba52775f7ac62bb2ef5b2 100644 --- a/paddle/fluid/operators/one_hot_v2_op.h +++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,23 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/one_hot_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { +namespace phi { template struct OneHotV2OpFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; + const DenseTensor* in_; + DenseTensor* out_; int depth_; const DeviceContext& ctx_; bool allow_out_of_range_; - OneHotV2OpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, - int depth, const DeviceContext& ctx, + OneHotV2OpFunctor(const DenseTensor* in, + DenseTensor* out, + int depth, + const DeviceContext& ctx, bool allow_out_of_range = false) : in_(in), out_(out), @@ -40,8 +42,8 @@ struct OneHotV2OpFunctor { void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - phi::funcs::set_constant(ctx_, out_, 0.0); + auto* p_out_data = ctx_.template Alloc(out_); + funcs::set_constant(ctx_, out_, 0.0); if (allow_out_of_range_) { for (int i = 0; i < numel; ++i) { @@ -52,51 +54,46 @@ struct OneHotV2OpFunctor { } else { for (int i = 0; i < numel; ++i) { PADDLE_ENFORCE_GE( - p_in_data[i], 0, - platform::errors::InvalidArgument( + p_in_data[i], + 0, + phi::errors::InvalidArgument( "Illegal index value, Input(input) value should be at least 0, " "but received input (%d) less than 0", p_in_data[i])); PADDLE_ENFORCE_LT( - p_in_data[i], depth_, - platform::errors::InvalidArgument( + p_in_data[i], + depth_, + phi::errors::InvalidArgument( "Illegal index value, Input(input) value should be less than " "Input(depth), " "but received input (%d) not less than depth (%d)", - p_in_data[i], depth_)); + p_in_data[i], + depth_)); *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; } } } }; -using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; -template -class OneHotV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int depth = context.Attr("depth"); - bool allow_out_of_range = context.Attr("allow_out_of_range"); - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - auto* depth_data = depth_tensor->data(); - depth = depth_data[0]; - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpFunctor( - in, out, depth, context.template device_context(), - allow_out_of_range)); +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out) { + auto out_dims = out->dims(); + if (out_dims[out_dims.size() - 1] == -1) { + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); } -}; -} // namespace operators -} // namespace paddle + phi::VisitDataType(dtype, + OneHotV2OpFunctor( + &x, out, depth, dev_ctx, allow_out_of_range)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + one_hot_raw, CPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc new file mode 
100644 index 0000000000000000000000000000000000000000..bb97694d8fc38d92f5290894a2c45dd21e7b1717 --- /dev/null +++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + CPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL( + pool3d_grad, CPU, ALL_LAYOUT, phi::Pool3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d57e282c3c8ae85573bf11eff43e6551a808ea0 --- /dev/null +++ b/paddle/phi/kernels/cpu/pool_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
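// The pool2d / pool3d kernels registered below rely on the conventional
// pooled-output-size arithmetic. The sketch is illustrative only (not part of
// this patch) and assumes the usual definition; padding_algorithm variants
// such as SAME/VALID adjust the paddings before this formula is applied.
#include <cmath>

inline int PooledSize(int input, int kernel, int pad, int stride,
                      bool ceil_mode) {
  // With ceil_mode, a partially covered window at the border still produces
  // an output element.
  double span = static_cast<double>(input + 2 * pad - kernel);
  return ceil_mode ? static_cast<int>(std::ceil(span / stride)) + 1
                   : static_cast<int>(std::floor(span / stride)) + 1;
}
// e.g. input=8, kernel=3, pad=0, stride=2 -> 3 without ceil_mode, 4 with it.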
+ +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, CPU, ALL_LAYOUT, phi::Pool3dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc index cf0179124ebdfcb58a2ac3436fcbd4d5347bb6f2..9a9bf46e948bc52c6a1e9679b4d3e51b10d89e6d 100644 --- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/reduce_prod_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..35ab99a98eba7e59853fb311d5b2307b69ae31b2 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -0,0 +1,318 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +constexpr size_t GetOffset(size_t x, size_t y, size_t width) { + return y * width + x; +} + +template +struct OffsetsAndRatios { + OffsetsAndRatios() = default; + OffsetsAndRatios(std::size_t xy, + std::size_t xY, + std::size_t Xy, + std::size_t XY, + T xy_ratio, + T xY_ratio, + T Xy_ratio, + T XY_ratio) + : xy(xy), + xY(xY), + Xy(Xy), + XY(XY), + xy_ratio(xy_ratio), + xY_ratio(xY_ratio), + Xy_ratio(Xy_ratio), + XY_ratio(XY_ratio) {} + + std::size_t xy = 0; + std::size_t xY = 0; + std::size_t Xy = 0; + std::size_t XY = 0; + T xy_ratio = 0.0f; + T xY_ratio = 0.0f; + T Xy_ratio = 0.0f; + T XY_ratio = 0.0f; +}; + +template +std::vector> GetIndexesAndRatios( + std::size_t width, + std::size_t height, + const T roi_width, + const T roi_height, + const T roi_xmin, + const T roi_ymin, + std::size_t pooled_width, + std::size_t roi_bin_grid_w, + std::size_t pooled_height, + std::size_t roi_bin_grid_h) { + const auto ind_num = + pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; + + std::vector> interpolation_cords; + interpolation_cords.reserve(ind_num); + + const auto bin_w = roi_width / pooled_width; + const auto bin_h = roi_height / pooled_height; + + for (std::size_t py = 0; py < pooled_height; py++) { + for (std::size_t px = 0; px < pooled_width; px++) { + for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { + // calculate x of sample points + auto y = + roi_ymin + + bin_h * (py + + static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); + for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { + // calculate x of sample points + auto x = roi_xmin + + bin_w * (px + + static_cast(ix + .5f) / + static_cast(roi_bin_grid_w)); + + // deal with elements out of map + if (y < -1.0 || y > height || x < -1.0 || x > width) { + interpolation_cords.emplace_back(); + continue; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 
0 : x; + + std::size_t x_low_index = static_cast(x); + std::size_t x_high_index; + if (x_low_index >= width - 1) { + x_high_index = x_low_index = width - 1; + x = static_cast(x_low_index); + } else { + x_high_index = x_low_index + 1; + } + T x_ratio = x_high_index - x; + + std::size_t y_low_index = static_cast(y); + std::size_t y_high_index; + if (y_low_index >= height - 1) { + y_high_index = y_low_index = height - 1; + y = static_cast(y_low_index); + } else { + y_high_index = y_low_index + 1; + } + T y_ratio = y_high_index - y; + + auto xy = GetOffset(x_low_index, y_low_index, width); + auto xY = GetOffset(x_low_index, y_high_index, width); + auto Xy = GetOffset(x_high_index, y_low_index, width); + auto XY = GetOffset(x_high_index, y_high_index, width); + + auto xy_ratio = x_ratio * y_ratio; + auto xY_ratio = x_ratio * (1 - y_ratio); + auto Xy_ratio = (1 - x_ratio) * y_ratio; + auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); + + interpolation_cords.emplace_back( + xy, xY, Xy, XY, xy_ratio, xY_ratio, Xy_ratio, XY_ratio); + } + } + } + } + return interpolation_cords; +} + +template +void Interpolate(std::vector& interpolated_values, // NOLINT + const std::vector>& interpolation_cords, + const T* data) { + for (auto& ic : interpolation_cords) { + auto xlyl_offset = ic.xy; + auto xhyl_offset = ic.Xy; + auto xlyh_offset = ic.xY; + auto xhyh_offset = ic.XY; + + auto xlyl_ratio = ic.xy_ratio; + auto xhyl_ratio = ic.Xy_ratio; + auto xlyh_ratio = ic.xY_ratio; + auto xhyh_ratio = ic.XY_ratio; + + interpolated_values.emplace_back( + xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + + xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); + } +} + +template +void AvgPool(const std::vector& interpolated_values, + T* output_data, + int roi_bin_grid_w, + int roi_bin_grid_h, + int pooled_width, + int pooled_height) { + const auto data_amount = pooled_width * pooled_height; + const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; + const T count = 1.0 / grid_points; + auto val_begin = interpolated_values.cbegin(); + for (auto i = 0; i < data_amount; ++i) { + T sum = 0.0; + auto val_end = val_begin + grid_points; + sum = std::accumulate(val_begin, val_end, sum); + val_begin = val_end; + output_data[i] = sum * count; + } +} + +template +void ROIAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = boxes.dims()[0]; + + auto in_stride = phi::stride(in_dims); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out->dims()); + + const T* input_data = x.data(); + DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); + int* roi_batch_id_data = roi_batch_id_list.data(); + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. 
But received the batch size of rois is %d, " + "and the batch size of images is %d", + boxes_batch_size, + batch_size)); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + roi_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto lod = boxes.lod(); + PADDLE_ENFORCE_EQ( + lod.empty(), + false, + errors::InvalidArgument("Input(ROIs) Tensor of ROIAlignOp " + "does not contain LoD information.")); + auto boxes_lod = lod.back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The boxes_batch_size and imgs " + "batch_size must be the same. But received boxes_batch_size = %d, " + "batch_size = %d", + boxes_batch_size, + batch_size)); + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + boxes_num_with_lod, + errors::InvalidArgument( + "The actual number of rois and the number of rois " + "provided from Input(RoIsLoD) in RoIAlign must be the same." + " But received actual number of rois is %d, and the number " + "of rois from RoIsLoD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + } + T* output_data = dev_ctx.template Alloc(out); + const T* boxes_data = boxes.data(); + T roi_offset = aligned ? T(0.5) : 0; + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = roi_batch_id_data[n]; + T roi_xmin = boxes_data[0] * spatial_scale - roi_offset; + T roi_ymin = boxes_data[1] * spatial_scale - roi_offset; + T roi_xmax = boxes_data[2] * spatial_scale - roi_offset; + T roi_ymax = boxes_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } + + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + auto interpolation_cords = GetIndexesAndRatios(width, + height, + roi_width, + roi_height, + roi_xmin, + roi_ymin, + pooled_width, + roi_bin_grid_w, + pooled_height, + roi_bin_grid_h); + + std::vector interpolated_values; + interpolated_values.reserve(interpolation_cords.size()); + for (auto channel = 0; channel < channels; ++channel) { + Interpolate(interpolated_values, interpolation_cords, batch_data); + AvgPool(interpolated_values, + output_data, + roi_bin_grid_w, + roi_bin_grid_h, + pooled_width, + pooled_height); + batch_data += in_stride[1]; + output_data += out_stride[1]; + interpolated_values.clear(); + } + boxes_data += roi_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {} diff --git a/paddle/phi/kernels/cpu/searchsorted_kernel.cc b/paddle/phi/kernels/cpu/searchsorted_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c036c2d438a36be779441f8f9aef78c0b5fbb642 --- /dev/null +++ b/paddle/phi/kernels/cpu/searchsorted_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
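// Illustrative sketch (not from the original patch): a single-point version of
// the bilinear step that GetIndexesAndRatios/Interpolate in the RoIAlign CPU
// kernel above perform for every sample point of every pooled bin, assuming a
// row-major H x W plane; AvgPool then averages the interpolated samples.
#include <cstddef>
#include <cstdio>
#include <vector>

static float BilinearSample(const std::vector<float>& data, std::size_t height,
                            std::size_t width, float y, float x) {
  // Clamp slightly-negative coordinates, as the kernel above does.
  if (y < 0.f) y = 0.f;
  if (x < 0.f) x = 0.f;

  std::size_t y_low = static_cast<std::size_t>(y);
  std::size_t x_low = static_cast<std::size_t>(x);
  std::size_t y_high = y_low + 1, x_high = x_low + 1;
  if (y_low >= height - 1) { y_low = y_high = height - 1; y = static_cast<float>(y_low); }
  if (x_low >= width - 1) { x_low = x_high = width - 1; x = static_cast<float>(x_low); }

  // Fractional offsets give the four corner weights (the *_ratio values above).
  float ly = y - static_cast<float>(y_low), lx = x - static_cast<float>(x_low);
  float hy = 1.f - ly, hx = 1.f - lx;

  auto at = [&](std::size_t yy, std::size_t xx) { return data[yy * width + xx]; };
  return hy * hx * at(y_low, x_low) + hy * lx * at(y_low, x_high) +
         ly * hx * at(y_high, x_low) + ly * lx * at(y_high, x_high);
}

int main() {
  std::vector<float> plane = {0.f, 1.f, 2.f, 3.f};  // 2 x 2 feature map
  std::printf("sample at (0.5, 0.5) = %f\n",
              BilinearSample(plane, 2, 2, 0.5f, 0.5f));  // prints 1.5
  return 0;
}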
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/searchsorted_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h" + +PD_REGISTER_KERNEL(searchsorted, + CPU, + ALL_LAYOUT, + phi::SearchsortedKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..44df36bb9fd87320db8548815b68a431e46bbcac --- /dev/null +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/set_value_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value_grad, + CPU, + ALL_LAYOUT, + phi::SetValueGradKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/cpu/softmax_kernel.cc b/paddle/phi/kernels/cpu/softmax_kernel.cc index 537b4326681a175fbad7593eed1d8b6caee9d86c..1d28669571f8d095cf53355be26135360008b0ce 100644 --- a/paddle/phi/kernels/cpu/softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/softmax_kernel.cc @@ -19,4 +19,4 @@ limitations under the License. 
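// Illustrative sketch (not from the original patch): per query element,
// searchsorted is a binary search into a sorted sequence. The registered
// SearchsortedKernel (implementation lives in searchsorted_kernel_impl.h, not
// shown in this diff) is a batched version of the same idea; the usual
// left/right convention corresponds to std::lower_bound / std::upper_bound.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> sorted = {1.0, 3.0, 5.0, 7.0};
  std::vector<double> values = {3.0, 6.0};
  for (double v : values) {
    auto left = std::lower_bound(sorted.begin(), sorted.end(), v);   // "left" side
    auto right = std::upper_bound(sorted.begin(), sorted.end(), v);  // "right" side
    std::printf("value %.1f -> left %td, right %td\n", v,
                left - sorted.begin(), right - sorted.begin());
  }
  return 0;  // prints: 3.0 -> 1, 2 and 6.0 -> 3, 3
}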
*/ #include "paddle/phi/kernels/impl/softmax_kernel_impl.h" PD_REGISTER_KERNEL( - softmax, CPU, ALL_LAYOUT, phi::SoftmaxRawKernel, float, double) {} + softmax, CPU, ALL_LAYOUT, phi::SoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 324798effbe56b8b7bdf0c3d31b21cd079a8cf1c..ea8e2702c19d6edd9f63d1da647db0ef07a417f1 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -38,7 +38,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc index 5aca5be12792387659b1c4db00e5d8ed98bc22dc..c91e7475f5b7c4ea7c420eb72cccd8cd82b0aa0c 100644 --- a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/triangular_solve_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc index 4247e597acef4aac14f93066a3ea6232734e0c8c..ab3d3c2376b8b05b4909f2c44df260299c2fe460 100644 --- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -37,8 +37,13 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, T* data = dev_ctx.template Alloc(tensor); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/phi/kernels/diag_grad_kernel.h b/paddle/phi/kernels/diag_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b9edab9bec44c367db2d36dfce05c425dcb07785 --- /dev/null +++ b/paddle/phi/kernels/diag_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/eigh_kernel.h b/paddle/phi/kernels/eigh_kernel.h index dd28752d9298345101d73913e405381c1d47c6c0..19653918302412e2f7e4dfa8caf71b6c9146a83f 100644 --- a/paddle/phi/kernels/eigh_kernel.h +++ b/paddle/phi/kernels/eigh_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 58ae11a9c4256d18dbacf6a40b06b308acaea159..fb2633cc9fcea7c619193ad964ad62247ed654dd 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -124,4 +124,22 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, DenseTensor* d_ddx, DenseTensor* d_ddy); +template +void ElementwiseFMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + +template +void ElementwiseFMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + } // namespace phi diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c1e73ad91c67d415437829d5fc731ac91a5722f5 --- /dev/null +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
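// Illustrative sketch (not from the original patch): a scalar version of the
// routing rule behind the ElementwiseFMaxGradKernel / ElementwiseFMinGradKernel
// declared above (and the FMaxGradDx/FMaxGradDy functors added later in this
// patch). The upstream gradient goes to x when x "wins" (x >= y, or y is NaN,
// since std::fmax returns the non-NaN operand) and to y otherwise, so
// dx + dy == dout element-wise.
#include <cmath>
#include <cstdio>

static void FMaxGrad(double x, double y, double dout, double* dx, double* dy) {
  bool x_wins = (x >= y) || std::isnan(y);
  *dx = x_wins ? dout : 0.0;
  *dy = x_wins ? 0.0 : dout;
}

int main() {
  double dx = 0.0, dy = 0.0;
  FMaxGrad(3.0, 1.0, 1.0, &dx, &dy);
  std::printf("x=3,   y=1: dx=%g dy=%g\n", dx, dy);  // gradient goes to x
  FMaxGrad(std::nan(""), 1.0, 1.0, &dx, &dy);
  std::printf("x=NaN, y=1: dx=%g dy=%g\n", dx, dy);  // fmax picks y, so dy gets it
  return 0;
}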
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void ElementwiseFMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void ElementwiseFMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index e0db7b51f8e04b561afd30b740166cee9fdd6a78..942eecae16837ad37718fef540bd73e154d5e88a 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -3,11 +3,12 @@ add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) -math_library(math_function DEPS blas dense_tensor tensor) -math_library(segment_pooling) -math_library(sequence2batch) +math_library(concat_and_split_functor DEPS dense_tensor) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -math_library(concat_and_split_functor DEPS dense_tensor) +math_library(math_function DEPS blas dense_tensor tensor) math_library(matrix_reduce DEPS dense_tensor) math_library(matrix_inverse DEPS dense_tensor eigen3 blas) +math_library(pooling DEPS dense_tensor) +math_library(segment_pooling) +math_library(sequence2batch) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 1a36e4e132f41720b6f9fc563026082e21971d96..c8fb54bb102d389cf005bac6d0f0edb78fb845ee 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -513,7 +513,270 @@ struct ReluGradGradFunctor : public BaseActivationFunctor { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; +/* + Out + DOut D_Dout + DDx -> 
TanhTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (-2) * Out * DDx * D_Dout_new + D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new + D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct TanhTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); + auto d_dOutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); + d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - + (static_cast(2) * dout * ddx * d_dOutNew); + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); + d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); + d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - + static_cast(2) * out * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. 
+ typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); + } +}; + +template +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + if (alpha < 1.f) { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } else { + out.device(d) = x.cwiseMin(static_cast(alpha) * x); + } + } +}; + +template +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = + static_cast(alpha) * (x < static_cast(0)).template cast(); + auto temp2 = (x >= static_cast(0)).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct LeakyReluGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* ddX, + DenseTensor* ddOut, + DenseTensor* dOut, + DenseTensor* dX) const { + if (ddOut) { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); + ddout.device(*d) = + ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public 
BaseActivationFunctor { T zero = static_cast(0.0f); @@ -824,6 +1087,133 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(tanh(x)); + } +}; + +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * (one - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + __device__ __forceinline__ T operator()(const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; + +template +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + __device__ __forceinline__ T operator()(const T x) const { + return x > static_cast(threshold) ? x : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > static_cast(threshold) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + __device__ __forceinline__ T operator()(const T x) const { + return x > zero ? x : static_cast(alpha) * x; + } +}; + +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 
1 : alpha) + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > zero ? dout : static_cast(alpha) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 840c8872f50f83c2859f07be2e0e7242a74004a7..06be592dd9375902cdbd0289caa347bc11015bd2 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -395,6 +395,8 @@ struct ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { + VLOG(4) << "Delete cuda pinned at " << data_alloc_released; + VLOG(4) << "Delete cuda pinned at " << col_alloc_released; paddle::memory::allocation::Allocator::AllocationDeleter( data_alloc_released); paddle::memory::allocation::Allocator::AllocationDeleter( diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index b01d50015f01ad2fb2b1ab7c0c0be6f4f1b5acb8..f9e66836a62699f16f0ea32c2af9175d1a1b88b2 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -159,6 +159,219 @@ struct DivGradYFunctor> { return -a * out_div_c_conj; } }; +// Fmin +template +struct FMinFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return std::fmin(a, b); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, + const dtype::float16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return static_cast(result); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE int operator()(const int a, const int b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmin(double_a, double_b); + return std::llrint(result); + } +}; + +// Fmax +template +struct FMaxFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return std::fmax(a, b); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, + const dtype::float16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return static_cast(result); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE int operator()(const int a, const int b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmax(double_a, double_b); + return std::llrint(result); + } +}; + +template +struct FMaxGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast((x >= y) || isnan(y)); + } +}; + +template <> +struct 
FMaxGradDx { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast((x >= y) || dtype::isnan(y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast((x >= y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast((x >= y)); + } +}; + +template +struct FMaxGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(!((x >= y) || isnan(y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast(!((x >= y) || dtype::isnan(y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast(!((x >= y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast(!((x >= y))); + } +}; + +template +struct FMinGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast((x <= y) || isnan(y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast((x <= y) || dtype::isnan(y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast((x <= y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast((x <= y)); + } +}; + +template +struct FMinGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(!((x <= y) || isnan(y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast(!((x <= y) || dtype::isnan(y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast(!((x <= y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast(!((x <= y))); + } +}; template struct MultiplyGradFunctor { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h index c5b04a8106561962b6916907d86450a63c763830..1c6756f1720a23ada5bb4ff2fdb4f4840660ed58 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -39,7 +39,7 @@ void ComputeInverseEigen(const Context& dev_ctx, int batch_size = rank > 2 ? 
a.numel() / (n * n) : 1; const T* a_ptr = a.data(); - T* a_inv_ptr = a_inv->mutable_data(dev_ctx.GetPlace()); + T* a_inv_ptr = dev_ctx.template Alloc(a_inv); for (int i = 0; i < batch_size; ++i) { ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc similarity index 83% rename from paddle/fluid/operators/math/pooling.cc rename to paddle/phi/kernels/funcs/pooling.cc index f2e5e955ec487585deee1cbebba3d2932ee1b05d..10c88b9798c6ff69b755aa2c7423558c35afe859 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/phi/kernels/funcs/pooling.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,11 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/pooling.h" -namespace paddle { -namespace operators { -namespace math { +#include "paddle/phi/kernels/funcs/pooling.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" + +namespace phi { +namespace funcs { /* * Tensors are in NCHW or NHWC format. @@ -25,13 +29,16 @@ namespace math { * height_down, width_left and width_right, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; @@ -50,7 +57,7 @@ class Pool2dFunctor { const int output_stride = output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -101,12 +108,16 @@ class Pool2dFunctor { } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -131,7 +142,7 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -244,14 +255,19 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. 
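// Illustrative sketch (not from the original patch): a single-channel version
// of the loop Pool2dFunctor above runs for every (n, c) plane, assuming
// row-major H x W data and an average-pool PoolProcess. The `exclusive` flag
// matches the one above: when true, the divisor is the number of in-bounds
// elements rather than ksize * ksize.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<float> AvgPool2d(const std::vector<float>& in, int H, int W,
                                    int ksize, int stride, int padding,
                                    bool exclusive) {
  int OH = (H + 2 * padding - ksize) / stride + 1;
  int OW = (W + 2 * padding - ksize) / stride + 1;
  std::vector<float> out(OH * OW, 0.f);
  for (int ph = 0; ph < OH; ++ph) {
    for (int pw = 0; pw < OW; ++pw) {
      int hstart = std::max(ph * stride - padding, 0);
      int hend = std::min(ph * stride - padding + ksize, H);
      int wstart = std::max(pw * stride - padding, 0);
      int wend = std::min(pw * stride - padding + ksize, W);
      float sum = 0.f;
      for (int h = hstart; h < hend; ++h)
        for (int w = wstart; w < wend; ++w) sum += in[h * W + w];
      int count = exclusive ? (hend - hstart) * (wend - wstart) : ksize * ksize;
      out[ph * OW + pw] = sum / count;
    }
  }
  return out;
}

int main() {
  std::vector<float> in = {1, 2, 3, 4};  // one 2 x 2 plane
  auto out = AvgPool2d(in, 2, 2, /*ksize=*/2, /*stride=*/2, /*padding=*/0, true);
  std::printf("pooled value: %f\n", out[0]);  // (1+2+3+4)/4 = 2.5
  return 0;
}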
*/ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -270,7 +286,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -324,13 +340,18 @@ class Pool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -357,7 +378,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -451,10 +472,11 @@ class Pool2dGradFunctor { h * input_width * input_channels + w * input_channels + c; auto output_idx = ph * output_width * output_channels + pw * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -477,13 +499,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
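// Illustrative sketch (not from the original patch): the backward counterpart
// of the average-pool loop. Pool2dGradFunctor above spreads each upstream
// gradient uniformly over the window it was pooled from, which is what
// pool_grad_process.compute amounts to in the average-pool case (max pooling
// routes the gradient differently, see the sketch after the next functor).
#include <cstdio>
#include <vector>

int main() {
  // One 2x2 window pooled into a single output cell; upstream gradient 1.0.
  const int ksize = 2;
  float dout = 1.0f;
  std::vector<float> dinput(ksize * ksize, 0.f);
  float scale = 1.0f / (ksize * ksize);  // inclusive count; exclusive pooling
                                         // would use the in-bounds count instead
  for (float& g : dinput) g += dout * scale;
  std::printf("dinput: %g %g %g %g\n", dinput[0], dinput[1], dinput[2], dinput[3]);
  return 0;
}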
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -502,7 +527,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -536,12 +561,15 @@ class MaxPool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -568,7 +596,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_height * input_width; @@ -641,29 +669,17 @@ class MaxPool2dGradFunctor { } } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; /* * Tensors are in NCDHW or NDHWC format. 
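// Illustrative sketch (not from the original patch): MaxPool2dGradFunctor
// keeps no index mask, so the backward pass has to re-identify the winning
// element. The functor bodies are mostly collapsed context in this hunk; the
// sketch below shows the usual comparison-based approach this kind of functor
// relies on: match the saved forward output against the window and route dout
// to the first element that equals it.
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> window = {1.f, 4.f, 2.f, 3.f};  // one 2x2 pooling window
  float out = 4.f;    // saved forward result: max of the window
  float dout = 1.f;   // upstream gradient for this output cell
  std::vector<float> dinput(window.size(), 0.f);
  for (int i = 0; i < 4; ++i) {
    if (window[i] == out) {  // compare against the saved forward output
      dinput[i] += dout;
      break;                 // only the max location receives the gradient
    }
  }
  std::printf("dinput: %g %g %g %g\n", dinput[0], dinput[1], dinput[2], dinput[3]);
  return 0;
}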
@@ -674,13 +690,16 @@ template class Pool2dGradFunctor -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; @@ -704,7 +723,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -771,12 +790,16 @@ class Pool3dFunctor { } } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -807,7 +830,7 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -966,14 +989,19 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. 
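// Illustrative sketch (not from the original patch): how the per-cell window
// bounds (dstart/dend, hstart/hend, wstart/wend) used by the pooling loops
// above are typically derived. The stride/padding form is the standard one;
// the adaptive form (floor/ceil proportional mapping) is the usual convention
// and is assumed here, since the helper functions themselves do not appear in
// this hunk.
#include <algorithm>
#include <cstdio>

// Fixed-size window: [start, end) clipped to [0, input_size).
static void FixedBounds(int out_idx, int ksize, int stride, int padding,
                        int input_size, int* start, int* end) {
  *start = std::max(out_idx * stride - padding, 0);
  *end = std::min(out_idx * stride - padding + ksize, input_size);
}

// Adaptive window: partition input_size into output_size nearly equal spans.
static void AdaptiveBounds(int out_idx, int output_size, int input_size,
                           int* start, int* end) {
  *start = (out_idx * input_size) / output_size;
  *end = ((out_idx + 1) * input_size + output_size - 1) / output_size;  // ceil
}

int main() {
  int s, e;
  FixedBounds(1, /*ksize=*/3, /*stride=*/2, /*padding=*/1, /*input=*/8, &s, &e);
  std::printf("fixed:    [%d, %d)\n", s, e);   // [1, 4)
  AdaptiveBounds(1, /*output=*/3, /*input=*/8, &s, &e);
  std::printf("adaptive: [%d, %d)\n", s, e);   // [2, 6)
  return 0;
}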
*/ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -997,7 +1025,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1051,10 +1079,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1068,13 +1097,18 @@ class Pool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1105,7 +1139,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1164,10 +1198,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1241,10 +1276,11 @@ class Pool3dGradFunctor { ((pd * output_height + ph) * 
output_width + pw) * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1270,13 +1306,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1300,7 +1339,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -1342,12 +1381,15 @@ class MaxPool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1378,7 +1420,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_depth * input_height * input_width; @@ -1475,29 +1517,17 @@ class MaxPool3dGradFunctor { } } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class 
Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; /* * All tensors are in NCHW format. @@ -1505,13 +1535,16 @@ template class Pool3dGradFunctor -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -1528,8 +1561,8 @@ class MaxPool2dWithIndexFunctor { const int output_stride = output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int hstart, hend; int wstart, wend; @@ -1583,14 +1616,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; const int input_width = input_grad->dims()[3]; @@ -1602,7 +1637,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1622,14 +1657,10 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. 
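// Illustrative sketch (not from the original patch): the with-index variants
// above store the flat argmax of each window in an integer mask (which is why
// the max_pool*_with_index registrations earlier in this patch pin that
// tensor's dtype to an integer type), so the backward pass is a plain scatter
// through the mask rather than a comparison against the saved output.
#include <cstdio>
#include <vector>

int main() {
  // One 2x2 input plane pooled by a single 2x2 window.
  std::vector<float> input = {1.f, 4.f, 2.f, 3.f};

  // Forward: record both the max value and its flat index (the mask entry).
  int mask = 0;
  float out = input[0];
  for (int i = 1; i < 4; ++i) {
    if (input[i] > out) { out = input[i]; mask = i; }
  }

  // Backward: scatter the upstream gradient through the recorded index.
  float dout = 1.f;
  std::vector<float> dinput(input.size(), 0.f);
  dinput[mask] += dout;

  std::printf("out=%g mask=%d dinput[1]=%g\n", out, mask, dinput[1]);
  return 0;
}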
@@ -1637,13 +1668,16 @@ template class MaxPool2dWithIndexGradFunctor -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1665,8 +1699,8 @@ class MaxPool3dWithIndexFunctor { const int output_stride = output_depth * output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int dstart, dend; int hstart, hend; @@ -1735,14 +1769,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; const int input_height = input_grad->dims()[3]; @@ -1756,7 +1792,7 @@ class MaxPool3dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1779,14 +1815,9 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu similarity index 54% rename from paddle/fluid/operators/math/pooling.cu rename to paddle/phi/kernels/funcs/pooling.cu index 9d96345eb1f6dca6fc5eb6cf5847baaf1a9019da..4cf5e1c02c59757ee8bd0ae91c18d0882b702da1 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 paddlepaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
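The rename above moves the CUDA implementation from paddle::operators::math into phi::funcs, so callers switch both the include path and the namespace. A hedged sketch of how a caller might look after the move, using the Pool2dFunctor signature that appears in the hunks below; LaunchMaxPool2d is a hypothetical wrapper written only to illustrate the migration:

// Hypothetical caller, not part of this patch.
// Before: #include "paddle/fluid/operators/math/pooling.h"
//         paddle::operators::math::Pool2dFunctor<...> pool2d;
#include <vector>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/pooling.h"

namespace phi {

template <typename T>
void LaunchMaxPool2d(const GPUContext& context,
                     const DenseTensor& input,
                     const std::vector<int>& ksize,
                     const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     DenseTensor* output) {
  // MaxPool<T> is the pool-process type declared in the relocated pooling.h.
  funcs::Pool2dFunctor<GPUContext, funcs::MaxPool<T>, T> pool2d;
  pool2d(context, input, ksize, strides, paddings,
         /*exclusive=*/true, /*adaptive=*/false, output, funcs::MaxPool<T>());
}

}  // namespace phi
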
@@ -12,63 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/pooling.h" + #include #include - -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { struct FastDivModForPooling { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; explicit HOSTDEVICE FastDivModForPooling(const int channels, const int output_width, const int output_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(output_width); - height = platform::FastDivMod(output_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(output_width); + height = paddle::platform::FastDivMod(output_height); } }; struct FastDivModForPoolingWithMoreStaff { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; - platform::FastDivMod ksize_w; - platform::FastDivMod ksize_h; - platform::FastDivMod stride_w; - platform::FastDivMod stride_h; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; + paddle::platform::FastDivMod ksize_w; + paddle::platform::FastDivMod ksize_h; + paddle::platform::FastDivMod stride_w; + paddle::platform::FastDivMod stride_h; explicit HOSTDEVICE FastDivModForPoolingWithMoreStaff( - const int channels, const int input_width, const int input_height, - const int ksize_width, const int ksize_height, const int stride_width, + const int channels, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, const int stride_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(input_width); - height = platform::FastDivMod(input_height); - ksize_w = platform::FastDivMod(ksize_width); - ksize_h = platform::FastDivMod(ksize_height); - stride_w = platform::FastDivMod(stride_width); - stride_h = platform::FastDivMod(stride_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(input_width); + height = paddle::platform::FastDivMod(input_height); + ksize_w = paddle::platform::FastDivMod(ksize_width); + ksize_h = paddle::platform::FastDivMod(ksize_height); + stride_w = paddle::platform::FastDivMod(stride_width); + stride_h = paddle::platform::FastDivMod(stride_height); } }; template -__device__ void OffsetPreparationFor4Dimension( - int index, bool channel_last, FastDivModForPooling divmods, - const int pad_width, const int pad_height, const int aux_width, - const int aux_height, int* w_offset, int* h_offset, int* c_offset, - int* stride) { +__device__ void OffsetPreparationFor4Dimension(int index, + bool channel_last, + FastDivModForPooling divmods, + const int pad_width, + const int pad_height, + const int aux_width, + const int aux_height, + int* w_offset, + int* h_offset, + int* c_offset, + int* stride) { if (!channel_last) { /* NCHW */ auto input_width_divmod 
= divmods.width.Divmod(index); auto input_height_divmod = divmods.height.Divmod(input_width_divmod.val[0]); @@ -91,21 +100,40 @@ __device__ void OffsetPreparationFor4Dimension( } template -__global__ void KernelPool2D( - const int nthreads, const T* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, FastDivModForPooling divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool2D(const int nthreads, + const T* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + FastDivModForPooling divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -139,25 +167,43 @@ __global__ void KernelPool2D( } template -__global__ void KernelPool2DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const const T* __restrict__ output_grad, - const int output_width, const int output_height, const int input_width, - const int input_height, const int ksize_width, const int ksize_height, - const int stride_width, const int stride_height, const int padding_width, - const int padding_height, FastDivModForPoolingWithMoreStaff divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, - T* __restrict__ input_grad, bool channel_last = false) { +__global__ void KernelPool2DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const const T* __restrict__ output_grad, + const int output_width, + const int output_height, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, + const int stride_height, + const int padding_width, + const int padding_height, + FastDivModForPoolingWithMoreStaff divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* __restrict__ input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { T input = static_cast(0); T input_grad_data = static_cast(0); int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension<>(index, channel_last, divmods, - padding_width, padding_height, - output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension<>(index, + channel_last, + divmods, + padding_width, + padding_height, + output_width, + output_height, + &w_offset, + 
&h_offset, + &c_offset, + &output_offset); if (pool_process.use_x) { input = input_data[index]; output_data += output_offset; @@ -188,7 +234,9 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -217,9 +265,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } else { @@ -232,9 +282,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } @@ -244,19 +296,38 @@ __global__ void KernelPool2DGrad( } template -__global__ void KernelMaxPool2DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_height, - const int input_width, const int output_height, const int output_width, - const int ksize_height, const int ksize_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* input_grad, FastDivModForPooling divmods, bool channel_last = false) { +__global__ void KernelMaxPool2DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + T* input_grad, + FastDivModForPooling divmods, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; input_grad += input_offset; @@ -285,17 +356,24 @@ __global__ void KernelMaxPool2DGrad( if (maxIndex != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIndex, + output_grad[index]); } } } template void Pool2dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const 
std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -314,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // platform::ChangeThreadNum(context, &thread_num); + // paddle::platform::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -323,11 +401,24 @@ void Pool2dDirectCUDAFunctor::operator()( auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); - KernelPool2D<<>>( - nthreads, input, input_channels, input_height, input_width, output_height, - output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_divmods, pool_compute, exclusive, - adaptive, output); + KernelPool2D<<>>(nthreads, + input, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -338,13 +429,16 @@ void Pool2dDirectCUDAFunctor::operator()( * height_down, width_left and width_right, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -361,12 +455,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -375,17 +469,35 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const 
std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -410,12 +522,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -424,10 +536,25 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; /* @@ -438,16 +565,18 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -465,30 +594,53 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + 
stride_height); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -514,21 +666,41 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data, - channel_last); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data, + channel_last); } }; @@ -540,16 +712,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -567,7 +739,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -577,17 +749,33 @@ class MaxPool2dGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + input_grad_data, + pool_divmods); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -614,7 +802,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -625,71 +813,80 @@ class MaxPool2dGradFunctor { FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods, channel_last); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + 
input_grad_data, + pool_divmods, + channel_last); } }; -template class Pool2dDirectCUDAFunctor, - float>; -template class Pool2dDirectCUDAFunctor, - float>; - -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; - -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool2dDirectCUDAFunctor, float>; +template class Pool2dDirectCUDAFunctor, float>; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; + +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; template -__global__ void KernelPool3D( - const int nthreads, const T* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool3D(const int nthreads, + const T* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -764,16 +961,31 @@ __global__ void KernelPool3D( } template -__global__ void KernelPool3DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const T* __restrict__ output_grad, - const int channels, const int input_depth, const int input_height, - const int input_width, const int 
output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, PoolProcess pool_process, bool exclusive, - bool adaptive, T* input_grad, bool channel_last = false) { +__global__ void KernelPool3DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const T* __restrict__ output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, d_offset, c_offset, batch_idx, output_stride; @@ -867,7 +1079,9 @@ __global__ void KernelPool3DGrad( : (pd * output_height + ph) * output_width + pw; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -878,15 +1092,28 @@ __global__ void KernelPool3DGrad( } template -__global__ void KernelMaxPool3DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_depth, - const int input_height, const int input_width, const int output_depth, - const int output_height, const int output_width, const int ksize_depth, - const int ksize_height, const int ksize_width, const int stride_depth, - const int stride_height, const int stride_width, const int padding_depth, - const int padding_height, const int padding_width, T* input_grad, - bool channel_last = false) { +__global__ void KernelMaxPool3DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -949,17 +1176,23 @@ __global__ void KernelMaxPool3DGrad( } if (maxIdx != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); } } } template void Pool3dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* 
input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -990,11 +1223,28 @@ void Pool3dDirectCUDAFunctor::operator()( dim3 threads(thread_num, 1); dim3 grid(blocks, 1); - KernelPool3D<<>>( - nthreads, input, input_channels, input_depth, input_height, input_width, - output_depth, output_height, output_width, ksize_depth, ksize_height, - ksize_width, stride_depth, stride_height, stride_width, padding_depth, - padding_height, padding_width, pool_compute, exclusive, adaptive, output); + KernelPool3D<<>>(nthreads, + input, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -1006,13 +1256,16 @@ void Pool3dDirectCUDAFunctor::operator()( * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1034,31 +1287,52 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + 
const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1089,24 +1363,42 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; @@ -1119,16 +1411,18 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1152,7 +1446,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1161,21 +1455,43 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + 
input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1206,7 +1522,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1215,11 +1531,30 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data, + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + input_grad_data, channel_last); // add channel_last } }; @@ -1233,16 +1568,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. 
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1265,7 +1600,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1274,18 +1609,37 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1316,7 +1670,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1325,77 +1679,93 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data, channel_last); // add channel_last + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + 
ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data, + channel_last); // add channel_last } }; -template class Pool3dDirectCUDAFunctor, - float>; -template class Pool3dDirectCUDAFunctor, - float>; - -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; - -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool3dDirectCUDAFunctor, float>; +template class Pool3dDirectCUDAFunctor, float>; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; + +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; template -__global__ void KernelMaxPool2dWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, bool adaptive, T1* output_data, T2* mask_data, - FastDivModForPooling divmods) { +__global__ void KernelMaxPool2dWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, input_width, input_height, &w_offset, - &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -1431,20 +1801,38 @@ __global__ void KernelMaxPool2dWithIdx( } template -__global__ void 
KernelMaxPool2DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask_data, - const int channels, const int input_height, const int input_width, - const int output_height, const int output_width, const int ksize_height, - const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, bool adaptive, - T1* input_grad, FastDivModForPooling divmods) { +__global__ void KernelMaxPool2DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + output_width, + output_height, + &w_offset, + &h_offset, + &c_offset, + &output_offset); mask_data += output_offset; output_grad += output_offset; @@ -1487,13 +1875,16 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -1509,13 +1900,13 @@ class MaxPool2dWithIndexFunctor { const int padding_width = paddings[1]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1525,10 +1916,23 @@ class MaxPool2dWithIndexFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2dWithIdx<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, adaptive, output_data, - mask_data, pool_divmods); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + output_data, + mask_data, + pool_divmods); } }; @@ 
-1538,14 +1942,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_height = input_grad->dims()[2]; @@ -1561,7 +1967,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -1571,31 +1977,53 @@ class MaxPool2dWithIndexGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, input_width, input_height); KernelMaxPool2DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_height, - input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, adaptive, - input_grad_data, pool_divmods); + nthreads, + output_grad_data, + mask_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + input_grad_data, + pool_divmods); } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template -__global__ void KernelMaxPool3DWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - bool adaptive, T1* output_data, T2* mask_data) { +__global__ void KernelMaxPool3DWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -1650,14 +2078,27 @@ __global__ void 
KernelMaxPool3DWithIdx( } template -__global__ void KernelMaxPool3DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask, - const int channels, const int input_depth, const int input_height, - const int input_width, const int output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, bool adaptive, T1* input_grad) { +__global__ void KernelMaxPool3DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset = index % input_width; @@ -1727,13 +2168,16 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1754,14 +2198,14 @@ class MaxPool3dWithIndexFunctor { const int padding_width = paddings[2]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1769,10 +2213,26 @@ class MaxPool3dWithIndexFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdx<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, adaptive, output_data, + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, + output_data, mask_data); } }; @@ -1783,14 +2243,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
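// The allocation idiom in these functors changes from the fluid-style
// output->mutable_data<T>(context.GetPlace()) to the phi-style
// context.template Alloc<T>(output): memory is requested through the device
// context rather than through the tensor. A minimal sketch of the new
// pattern (AllocOutput and its arguments are illustrative only):
#include "paddle/phi/core/dense_tensor.h"

template <typename T, typename Context>
T* AllocOutput(const Context& dev_ctx, phi::DenseTensor* out) {
  // Allocates out's storage on dev_ctx's place and returns a typed pointer,
  // replacing the older out->mutable_data<T>(dev_ctx.GetPlace()).
  return dev_ctx.template Alloc<T>(out);
}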
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_depth = input_grad->dims()[2]; @@ -1811,7 +2273,7 @@ class MaxPool3dWithIndexGradFunctor { const T1* output_grad_data = output_grad.data(); const T2* mask_data = mask.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1820,23 +2282,34 @@ class MaxPool3dWithIndexGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_depth, - input_height, input_width, output_depth, output_height, output_width, - ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, adaptive, + nthreads, + output_grad_data, + mask_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, input_grad_data); } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..19c6d52c4c9018f821c4e7f6ddaebf933aa045e8 --- /dev/null +++ b/paddle/phi/kernels/funcs/pooling.h @@ -0,0 +1,469 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/macros.h" // import FLT_MAX +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/gpu_decls.h" +#endif + +namespace phi { +namespace funcs { + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. + * AvgPool initializes temp variable to the zero to accumulate all values + * in pool pooling, and finally takes the average. + * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. + */ +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + DEVICE inline void finalize(const T& pool_field, T* y) {} +}; + +template +class AvgPool { + using MT = typename dtype::MPTypeTrait::Type; + MT intermediate_res; + + public: + DEVICE inline T initial() { + intermediate_res = static_cast(0.0f); + return static_cast(0); + } + + DEVICE inline void compute(const T& x, T* y) { + intermediate_res += static_cast(x); + } + + DEVICE inline void finalize(const T& pool_field, T* y) { + *y = static_cast(intermediate_res / (static_cast(pool_field))); + } +}; + +template +class MaxPoolGrad { + public: + static constexpr bool use_x = true; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += dy * static_cast(x == y); + } +}; + +template +class AvgPoolGrad { + public: + static constexpr bool use_x = false; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += (scale * dy); + } +}; + +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + +/* + * \brief Getting pooling results, and calculating gradient. + * + * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C + * is the number of channels, H and W is the height and width of feature. + * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C + * is the number of channels, D, H and W is the depth, height and width of + * feature. + * + * In max pooling, it is possible that the pooling region has multiple maximum + * elements. In this case, we should compute the gradient of the first maximum + * element. + * This is different from average pooling. So we rewrite the max_pool_grad: + * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
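// AdaptStartIndex / AdaptEndIndex split an input extent into output_size
// nearly equal, possibly overlapping windows: window ph covers
// [floor(ph * in / out), ceil((ph + 1) * in / out)). A small host-side check
// of that arithmetic for input_size = 10, output_size = 4 (illustrative only):
#include <cassert>
#include <cmath>

inline int AdaptStart(int ph, int in, int out) {
  return static_cast<int>(std::floor(static_cast<double>(ph * in) / out));
}
inline int AdaptEnd(int ph, int in, int out) {
  return static_cast<int>(std::ceil(static_cast<double>((ph + 1) * in) / out));
}

inline void CheckAdaptiveWindows() {
  // Windows for 10 -> 4 are [0,3), [2,5), [5,8), [7,10): they cover the whole
  // input (with small overlaps) and the last one ends exactly at input_size.
  assert(AdaptStart(0, 10, 4) == 0 && AdaptEnd(0, 10, 4) == 3);
  assert(AdaptStart(1, 10, 4) == 2 && AdaptEnd(1, 10, 4) == 5);
  assert(AdaptStart(3, 10, 4) == 7 && AdaptEnd(3, 10, 4) == 10);
}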
+ */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool2dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool2dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool3dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool3dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool3dGradFunctor { 
+ public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool3dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +/* + * \brief Getting max pooling results and corresponding max index, and + * calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. + * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +inline int PoolOutputSize(int input_size, + int filter_size, + int padding_1, + int padding_2, + int stride, + bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = + (input_size - filter_size + padding_1 + padding_2) / stride + 1; + } else { + output_size = + (input_size - filter_size + padding_1 + padding_2 + stride - 1) / + stride + + 1; + } + PADDLE_ENFORCE_GT( + output_size, + 0, + errors::InvalidArgument( + "the output size must be greater than 0. But received: " + "output_size = %d due to the settings of input_size(%d), " + "padding(%d,%d), " + "k_size(%d) and stride(%d). 
Please check again!", + output_size, + input_size, + padding_1, + padding_2, + filter_size, + stride)); + return output_size; +} + +inline int MaxPoolOutputSize(int input_size, + int filter_size, + int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +template +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& strides, + const std::vector& kernel_size) { + // set padding size == data_dims.size() * 2 + auto data_shape = vectorize(data_dims); + if (static_cast(paddings->size()) == data_dims.size()) { + for (int i = 0; i < data_dims.size(); ++i) { + T copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ(data_dims.size() * 2, + paddings->size(), + errors::InvalidArgument( + "Paddings size %d should be the same or twice as the " + "pooling size %d.", + paddings->size(), + data_dims.size() * 2)); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < data_dims.size(); ++i) { + T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + T pad_sum = + std::max((out_size - 1) * strides[i] + kernel_size[i] - data_shape[i], + static_cast(0)); + T pad_0 = pad_sum / 2; + T pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +template +inline void UpdateKernelSize(std::vector* kernel_size, + const DDim data_dims) { + kernel_size->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < kernel_size->size(); ++i) { + *(kernel_size->begin() + i) = static_cast(data_dims[i]); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h new file mode 100644 index 0000000000000000000000000000000000000000..3617e3cd2f406d889c0b79ecfc34a68d19259a17 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +namespace sparse { + +inline const DDim InferDenseDims(const DDim& x_dims, + const int64_t sparse_dim, + const int64_t non_zero_num) { + auto dense_dim = x_dims.size() - sparse_dim; + DDim values_dims; + if (dense_dim > 0) { + std::vector dense_dim_vec(dense_dim + 1); + dense_dim_vec[0] = non_zero_num; + memcpy(&dense_dim_vec[1], + x_dims.Get() + sparse_dim, + dense_dim * sizeof(x_dims[0])); + values_dims = phi::make_ddim(dense_dim_vec); + } else { + values_dims = phi::make_ddim({non_zero_num}); + } + return values_dims; +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..68fe8880a971dd7a56d677a5567bb053f5ba117a --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace funcs { +namespace sparse { + +struct Dims4D { + int dims[4]; + Dims4D(const int batch, const int x, const int y, const int z) { + dims[0] = batch; + dims[1] = z; + dims[2] = y; + dims[3] = x; + } + HOSTDEVICE const int& operator[](int i) const { return dims[i]; } +}; + +// Judge whether the current position x is in (lower, upper) +inline HOSTDEVICE bool Check(const int& x, + const int& kx, + const int& pad, + const int& stride, + const int dilation, + const int kdim, + const int xdim) { + const int lower = x - dilation * kx + pad; + const int uper = x + (kdim - kx - 1) * dilation - pad; + return (lower >= 0 && lower % stride == 0 && uper < xdim); +} + +// Check whether the current position(x, y, z) is legal: +// Judge the minimum and maximum values at each latitude +inline HOSTDEVICE bool Check(const Dims4D& dims, + const Dims4D& kernel_dims, + const Dims4D& paddings, + const Dims4D& dilations, + const Dims4D& strides, + const int x, + const int y, + const int z, + const int kx, + const int ky, + const int kz) { + bool x_valid = Check( + x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]); + bool y_valid = Check( + y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]); + bool z_valid = Check( + z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]); + return (x_valid && y_valid && z_valid); +} + +template +inline HOSTDEVICE int PointToIndex(const int& batch, + const int& x, + const int& y, + const int& z, + const Dim& dims) { + return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + + y * dims[3] + x; +} + +// TODO(zhangkaihuo): use division and multiply to optimize +// modulo operation +template +inline HOSTDEVICE void IndexToPoint( + const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { + int n = 
index; + *x = n % dims[3]; + n /= dims[3]; + *y = n % dims[2]; + n /= dims[2]; + *z = n % dims[1]; + n /= dims[1]; + *batch = n; +} + +inline void GetOutShape(const DDim& x_dims, + const DDim& kernel_dims, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DDim* out_dims) { + PADDLE_ENFORCE_EQ( + x_dims.size(), + 5, + phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); + PADDLE_ENFORCE_EQ(kernel_dims.size(), + 5, + phi::errors::InvalidArgument( + "the shape of kernel should be (D, H, W, C, OC)")); + + // infer out shape + (*out_dims)[0] = x_dims[0]; + (*out_dims)[4] = kernel_dims[4]; + for (int i = 1; i < 4; i++) { + (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - + dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / + strides[i - 1] + + 1; + } +} + +inline void ResetSubmKernelSizeAndStrides(const DDim& kernel_dims, + std::vector* paddings, + std::vector* strides) { + for (uint64_t i = 0; i < paddings->size(); i++) { + (*paddings)[i] = kernel_dims[i] / 2; + (*strides)[i] = 1; + } +} + +template +inline void SubmPreProcess(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const int in_channels, + const int out_channels, + const int half_kernel_size, + DenseTensor* kernel_grad, + DenseTensor* x_grad) { + auto blas = phi::funcs::GetBlas(dev_ctx); + T* d_kernel_ptr = kernel_grad->data(); + blas.GEMM(CblasTrans, + CblasNoTrans, + x.non_zero_elements().dims()[1], + out_grad.non_zero_elements().dims()[1], + x.non_zero_elements().dims()[0], + static_cast(1), + x.non_zero_elements().data(), + out_grad.non_zero_elements().data(), + static_cast(0), + d_kernel_ptr + half_kernel_size * in_channels * out_channels); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + T* x_grad_ptr = x_grad->data(); + blas.GEMM(CblasNoTrans, + CblasTrans, + out_grad.non_zero_elements().dims()[0], + in_channels, + out_grad.non_zero_elements().dims()[1], + static_cast(1), + out_grad.non_zero_elements().data(), + kernel.data() + half_kernel_size * in_channels * out_channels, + static_cast(0), + x_grad_ptr); +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index b3189fc5cc3c307f04758663250098f384c2c8fc..336e9c809427c68be79bc8eaddd98193462f5405 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -20,7 +20,6 @@ #endif // PADDLE_WITH_CUDA #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/gather_tree_kernel.h b/paddle/phi/kernels/gather_tree_kernel.h index e5a1a684daef099b5da8e7d9b8469b2857c29a6b..b3e6ffbc4297a2ae6a067e6b1ec5f2f88f7ef2ba 100644 --- a/paddle/phi/kernels/gather_tree_kernel.h +++ b/paddle/phi/kernels/gather_tree_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" + namespace phi { template diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index c2995c79a7e8c2651ed4aa16d75d59c8f24c96dc..00792b8ab607036112295f2dd4018c69eb78680a 100644 --- 
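// PointToIndex linearizes a sparse-conv coordinate (batch, z, y, x) against
// dims = {N, D, H, W} as ((batch * D + z) * H + y) * W + x, and IndexToPoint
// inverts it with mod/div in the same order. A tiny host-side round-trip
// check (the concrete sizes are illustrative only):
#include <cassert>

inline void CheckIndexRoundTrip() {
  const int D = 4, H = 5, W = 6;            // dims[1], dims[2], dims[3]
  const int batch = 2, z = 3, y = 1, x = 5;
  const int index = batch * D * H * W + z * H * W + y * W + x;

  int n = index;
  const int rx = n % W;  n /= W;
  const int ry = n % H;  n /= H;
  const int rz = n % D;  n /= D;
  const int rbatch = n;
  assert(rx == x && ry == y && rz == z && rbatch == batch);
}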
a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -79,113 +79,97 @@ void ActivationGradGPUImpl(const Context& dev_ctx, const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor); +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); 
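// The one-attribute macro defined above is used just below for LeakyRelu;
// expanding DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu,
// CudaLeakyReluGradFunctor, alpha) by hand gives roughly the kernel below:
// the functor's registered attribute pointer is filled from the float
// argument before the element-wise launch. This expansion is a sketch for
// illustration, not code quoted from the patch.
template <typename T, typename Context>
void LeakyReluGradKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         const DenseTensor& dout,
                         float alpha,
                         DenseTensor* dx) {
  funcs::CudaLeakyReluGradFunctor<T> functor;
  auto attrs = functor.GetAttrs();
  *(attrs[0].second) = alpha;  // attrs[0] holds the ("alpha", float*) pair
  ActivationGradGPUImpl<T, Context, funcs::CudaLeakyReluGradFunctor<T>>(
      dev_ctx, &x, nullptr, &dout, dx, functor);
}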
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, + CudaLeakyReluGradFunctor, + alpha); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, + CudaThresholdedReluGradFunctor, + threshold); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, + CudaBReluGradFunctor, + t_min, + t_max); } // namespace phi -PD_REGISTER_KERNEL(cos_grad, - GPU, - ALL_LAYOUT, - phi::CosGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(tan_grad, - GPU, - ALL_LAYOUT, - phi::TanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos_grad, - GPU, - ALL_LAYOUT, - phi::AcosGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sin_grad, - GPU, - ALL_LAYOUT, - phi::SinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin_grad, - GPU, - ALL_LAYOUT, - phi::AsinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan_grad, - GPU, - ALL_LAYOUT, - phi::AtanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh_grad, - GPU, - ALL_LAYOUT, - phi::SinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh_grad, - GPU, - ALL_LAYOUT, - phi::CoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh_grad, - GPU, - ALL_LAYOUT, - phi::AsinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acosh_grad, - GPU, - ALL_LAYOUT, - phi::AcoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atanh_grad, - GPU, - ALL_LAYOUT, - phi::AtanhGradKernel, - float, - double, - phi::dtype::float16) {} + #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(relu_grad, GPU, @@ -219,3 +203,34 @@ PD_REGISTER_KERNEL(relu_double_grad, phi::dtype::float16, phi::dtype::bfloat16) {} #endif + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + GPU, \ + ALL_LAYOUT, \ + phi::func, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, + 
LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 26752b89e7c345f88cdbe2000b119c07507d2c37..3c340a89f5746bd8de31826f7639e6ed0b7391f6 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -46,6 +46,35 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl(dev_ctx, x, out, functor); \ } +#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) @@ -58,6 +87,14 @@ DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor) + +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + CudaThresholdedReluFunctor, + threshold) + +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) } // namespace phi @@ -79,65 +116,29 @@ PD_REGISTER_KERNEL(relu, phi::dtype::float16, phi::dtype::bfloat16) {} #endif -PD_REGISTER_KERNEL( - sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos, - GPU, - ALL_LAYOUT, - phi::AcosKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin, - GPU, - ALL_LAYOUT, - phi::AsinKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan, - GPU, - ALL_LAYOUT, - phi::AtanKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh, - GPU, - ALL_LAYOUT, - phi::SinhKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh, - GPU, - ALL_LAYOUT, - phi::CoshKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh, - GPU, - ALL_LAYOUT, - phi::AsinhKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acosh, - GPU, - ALL_LAYOUT, - phi::AcoshKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atanh, - GPU, - ALL_LAYOUT, - phi::AtanhKernel, - float, - double, - phi::dtype::float16) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + GPU, \ + ALL_LAYOUT, \ + phi::func, \ + float, \ + double, \ + phi::dtype::float16, \ 
+ phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel) +PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel) +PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel) +PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..af2612bb10c9fe108a471253ff87f2a686059c2a --- /dev/null +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/allclose_kernel.h" + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void AllcloseCUDAKernel(const T* in_data, + const T* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const T a = in_data[i], b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + if (!val) *out_data = false; + } +} + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Rtol) type must be double, but get %s.", rtol.dtype())); + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Atol) type must be double, but get %s.", atol.dtype())); + + const T* in_data = x.data(); + const T* other_data = y.data(); + auto rtol_v = rtol.to(); + auto atol_v = atol.to(); + bool* out_data = dev_ctx.template Alloc(out); + + int num = x.numel(); + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? 
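// The element-wise test in AllcloseCUDAKernel is the usual
// |a - b| <= atol + rtol * |b| rule, plus NaN handling controlled by
// equal_nan and a small extra slack term. A host-side sketch of the same
// predicate for one element, ignoring the 1e-15 slack (illustrative only):
#include <cmath>

inline bool CloseScalar(double a, double b, double rtol, double atol,
                        bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    return equal_nan && std::isnan(a) == std::isnan(b);
  }
  const double left = std::fabs(a - b);
  const double right = atol + rtol * std::fabs(b);
  return a == b || left <= right;
}
// e.g. CloseScalar(1.0, 1.0000001, 1e-5, 1e-8, false) is true, while
// CloseScalar(1.0, 1.1, 1e-5, 1e-8, false) is false.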
block : grid; +#ifdef PADDLE_WITH_HIP + hipMemset(out_data, true, sizeof(bool)); +#else + cudaMemset(out_data, true, sizeof(bool)); +#endif + AllcloseCUDAKernel<<>>( + in_data, other_data, rtol_v, atol_v, equal_nan, num, out_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + allclose, GPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); +} diff --git a/paddle/phi/kernels/gpu/diag_grad_kernel.cu b/paddle/phi/kernels/gpu/diag_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..65bf837e6cf8a330fbb744c994311d17a7cc6299 --- /dev/null +++ b/paddle/phi/kernels/gpu/diag_grad_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +// Extract the diagonal of a matrix 'dout' to a matrix 'dx' +template +__global__ void ExtractDiagonalKernel(const T* dout, + T* dx, + std::ptrdiff_t start, + std::ptrdiff_t dx_length, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t xStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < dx_length; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t outOffset = start + sumStride * idx; + dx[xStride * idx] = dout[outOffset]; + } +} + +// Paste a vector 'dout' to the diagonal of a matrix 'dx' +template +__global__ void PasteDiagonalKernel(const T* dout, + T* dx, + std::ptrdiff_t start, + std::ptrdiff_t size, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t outStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + std::ptrdiff_t xOffset = start + sumStride * idx; + dx[xOffset] = dout[outStride * idx]; + } +} + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad) { + T* dx_data = dev_ctx.template Alloc(x_grad); + auto* dout_data = out_grad.data(); + auto dx_dims = x_grad->dims(); + auto dout_dims = out_grad.dims(); + + auto GetBlockGridSize = [&dev_ctx](int64_t size) { + const int64_t block_size = + std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); + int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + return std::tuple{block_size, grid_size}; + }; + + if (dx_dims.size() == 1) { + auto dx_length = dx_dims[0]; + auto size = (offset > 0) ? 
dx_length + offset : dx_length - offset; + int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + if (size > 0) { + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); + auto start = + (offset >= 0 ? offset * dout_stride_1 : -offset * dout_stride_0); + + std::tuple block_grid_size = GetBlockGridSize(size); + ExtractDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>( + dout_data, + dx_data, + start, + dx_length, + dout_stride_0 + dout_stride_1, + dx_stride); + } + } else { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, x_grad, static_cast(0)); + + int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims); + int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims); + int64_t size; + if (offset > 0) { + size = std::min(dx_dims[0], dx_dims[1] - offset); + } else { + size = std::min(dx_dims[0] + offset, dx_dims[1]); + } + + if (size > 0) { + auto start = (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0); + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + std::tuple block_grid_size = GetBlockGridSize(size); + PasteDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>(dout_data, + dx_data, + start, + size, + dx_stride_0 + dx_stride_1, + dout_stride_0); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(diag_grad, + GPU, + ALL_LAYOUT, + phi::DiagGradKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu index fc70639787173d84b69262245dbb0500aa179a90..95d3d3365d91be61013e2016d06334f0498d866a 100644 --- a/paddle/phi/kernels/gpu/diag_kernel.cu +++ b/paddle/phi/kernels/gpu/diag_kernel.cu @@ -130,5 +130,12 @@ void DiagKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - diag, GPU, ALL_LAYOUT, phi::DiagKernel, int, int64_t, float, double) {} +PD_REGISTER_KERNEL(diag, + GPU, + ALL_LAYOUT, + phi::DiagKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu index fdf61dc73991d84d4b38ddd214e1abf80cb2798e..5e33966055ea07d9b70227a5ed4760ad3b21e1a8 100644 --- a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/eigh_grad_kernel.h" #include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 81f7fac10880325e152f37b5d4ab783ae93a279c..c4481bf6ce3c33ea260d774d0ac240a166856388 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -282,3 +282,20 @@ PD_REGISTER_KERNEL(multiply_triple_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_REGISTER_KERNEL(elementwise_fmax_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu new file mode 100644 
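// ExtractDiagonalKernel walks a (possibly offset) diagonal of the row-major
// gradient matrix: diagonal element idx lives at flat position
// start + idx * (stride_0 + stride_1). For a 4x4 matrix with offset = 1,
// stride_0 = 4 and stride_1 = 1, so start = 1 and the step is 5, giving flat
// indices 1, 6, 11, i.e. positions (0,1), (1,2), (2,3). A tiny host-side
// check of that indexing (illustrative only):
#include <cassert>

inline void CheckDiagIndexing() {
  const int cols = 4, offset = 1;
  const int stride_0 = cols, stride_1 = 1;
  const int start = offset >= 0 ? offset * stride_1 : -offset * stride_0;
  const int expected[] = {1, 6, 11};  // min(4, 4 - offset) = 3 elements
  for (int idx = 0; idx < 3; ++idx) {
    assert(start + idx * (stride_0 + stride_1) == expected[idx]);
  }
}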
index 0000000000000000000000000000000000000000..2cffc68fa0648937b96095f5bd58210adaf865b3 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +PD_REGISTER_KERNEL(elementwise_fmax, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMaxKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMinKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu index a9e73ec37c8ed5f064144e27b06ac6304f5694b3..2906b81cb40096855fc990040f8d23b832f4da2e 100644 --- a/paddle/phi/kernels/gpu/gather_tree_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/gather_tree_kernel.h" + #include -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gather_tree_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/kron_grad_kernel.cu b/paddle/phi/kernels/gpu/kron_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..13ef2adaab3f32791e5c108b3f12b217e5dcea07 --- /dev/null +++ b/paddle/phi/kernels/gpu/kron_grad_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(kron_grad, + GPU, + ALL_LAYOUT, + phi::KronGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/kron_kernel.cu b/paddle/phi/kernels/gpu/kron_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a2124fd5af7d79cf6d1227a73105dd3e5b729547 --- /dev/null +++ b/paddle/phi/kernels/gpu/kron_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +PD_REGISTER_KERNEL(kron, + GPU, + ALL_LAYOUT, + phi::KronKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9b889a9b4c0069efcbf38a10ce00f20072560a36 --- /dev/null +++ b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/matrix_rank_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + DenseTensor atol_tensor; + if (use_default_tol) { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(0)); + } else { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(tol)); + } + MatrixRankTolKernel( + dev_ctx, x, atol_tensor, use_default_tol, hermitian, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(matrix_rank, // cuda_only + GPU, + ALL_LAYOUT, + phi::MatrixRankKernel, + float, + double) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2009547fc8d6f18c488faab5fd57cc985990229b --- /dev/null +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -0,0 +1,433 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
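// When use_default_tol is set, the tolerance falls back to the NumPy-style
// rtol = eps * max(rows, cols) (see rtol_T in matrix_rank_tol_kernel.cu
// below); the rank is then the number of singular values above the resulting
// threshold. A host-side sketch of that counting rule; combining the
// threshold as max(atol, rtol * sigma_max) is an assumption made for this
// illustration:
#include <algorithm>
#include <limits>
#include <vector>

inline int RankFromSingularValues(const std::vector<float>& sigma,
                                  int rows, int cols, float atol) {
  const float eps = std::numeric_limits<float>::epsilon();
  const float rtol = eps * static_cast<float>(std::max(rows, cols));
  const float sigma_max = *std::max_element(sigma.begin(), sigma.end());
  const float tol = std::max(atol, rtol * sigma_max);
  return static_cast<int>(std::count_if(
      sigma.begin(), sigma.end(), [tol](float s) { return s > tol; }));
}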
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" + +namespace phi { + +template +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + T* A, + T* U, + T* V, + T* S, + int* info, + int thin_UV = 1); + +template +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + T* A, + T* W, + int* info); + +template <> +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + float* A, + float* U, + float* V, + float* S, + int* info, + int thin_UV) { + // do not compute singular vectors + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + gesvdjInfo_t gesvdj_params = NULL; + int lda = m; + int ldu = m; + int ldt = n; + int lwork = 0; + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnSgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + int stride_A = lda * n; + int stride_U = ldu * (thin_UV ? k : m); + int stride_V = ldt * (thin_UV ? k : n); + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver SVD is not zero. 
[%d]", i, error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +template <> +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + double* A, + double* U, + double* V, + double* S, + int* info, + int thin_UV) { + // do not compute singular vectors + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + gesvdjInfo_t gesvdj_params = NULL; + int lda = m; + int ldu = m; + int ldt = n; + int lwork = 0; + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + int stride_A = lda * n; + int stride_U = ldu * (thin_UV ? k : m); + int stride_V = ldt * (thin_UV ? k : n); + for (int i = 0; i < batchSize; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); + // check the error info + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +template <> +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + float* A, + float* W, + int* info) { + auto handle = dev_ctx.cusolver_dn_handle(); + // Compute eigenvalues only + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + // matrix is saved as column-major in cusolver. + // numpy and torch use lower triangle to compute eigenvalues, so here use + // upper triangle + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; + int lda = n; + int stride_A = lda * n; + int lwork = 0; + syevjInfo_t params = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( + handle, jobz, uplo, n, A, lda, W, &lwork, params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(handle, + jobz, + uplo, + n, + A + stride_A * i, + lda, + W + n * i, + workspace_ptr, + lwork, + info, + params)); + + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver eigenvalues is not zero. 
[%d]", + i, + error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params)); +} + +template <> +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + double* A, + double* W, + int* info) { + auto handle = dev_ctx.cusolver_dn_handle(); + // Compute eigenvalues only + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + // upper triangle of A is stored + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; + int lda = n; + int stride_A = lda * n; + int lwork = 0; + syevjInfo_t params = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize( + handle, jobz, uplo, n, A, lda, W, &lwork, params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj(handle, + jobz, + uplo, + n, + A + stride_A * i, + lda, + W + n * i, + workspace_ptr, + lwork, + info, + params)); + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", + i, + error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params)); +} + +template +void MatrixRankTolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + auto* x_data = x.data(); + dev_ctx.template Alloc(out); + + auto dim_x = x.dims(); + auto dim_out = out->dims(); + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + int k = std::min(rows, cols); + auto numel = x.numel(); + int batches = numel / (rows * cols); + + T rtol_T = 0; + if (use_default_tol) { + rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); + } + + // Must Copy X once, because the gesvdj will destory the content when exit. 
+ DenseTensor x_tmp; + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp); + auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches); + int* info_ptr = reinterpret_cast(info->ptr()); + + DenseTensor eigenvalue_tensor; + eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k)); + auto* eigenvalue_data = dev_ctx.template Alloc(&eigenvalue_tensor); + + if (hermitian) { + SyevjBatched( + dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, info_ptr); + + phi::AbsKernel(dev_ctx, eigenvalue_tensor, &eigenvalue_tensor); + + } else { + DenseTensor U, VH; + U.Resize(detail::GetUDDim(dim_x, k)); + VH.Resize(detail::GetVHDDim(dim_x, k)); + auto* u_data = dev_ctx.template Alloc(&U); + auto* vh_data = dev_ctx.template Alloc(&VH); + GesvdjBatched(dev_ctx, + batches, + cols, + rows, + k, + x_tmp.data(), + vh_data, + u_data, + eigenvalue_data, + info_ptr, + 1); + } + + DenseTensor max_eigenvalue_tensor; + dev_ctx.template Alloc(&max_eigenvalue_tensor); + max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims())); + + phi::MaxKernel(dev_ctx, + eigenvalue_tensor, + std::vector{-1}, + false, + &max_eigenvalue_tensor); + + DenseTensor temp_rtol_tensor; + temp_rtol_tensor = + phi::Full(dev_ctx, {1}, static_cast(rtol_T)); + + DenseTensor rtol_tensor = + phi::Multiply(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor); + DenseTensor tol_tensor; + tol_tensor.Resize(dim_out); + dev_ctx.template Alloc(&tol_tensor); + + funcs::ElementwiseCompute, T, T>( + dev_ctx, + atol_tensor, + rtol_tensor, + -1, + GreaterElementFunctor(), + &tol_tensor); + + tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); + + DenseTensor compare_result; + compare_result.Resize(detail::NewAxisDim(dim_out, k)); + dev_ctx.template Alloc(&compare_result); + + int axis = -1; + funcs::ElementwiseCompute, T, int64_t>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::GreaterThanFunctor(), + &compare_result); + + phi::SumKernel(dev_ctx, + compare_result, + std::vector{-1}, + compare_result.dtype(), + false, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(matrix_rank_tol, // cuda_only + GPU, + ALL_LAYOUT, + phi::MatrixRankTolKernel, + float, + double) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..32c7fa1e85d150b99e7a05d169b01cd8727c1a98 --- /dev/null +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
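The tolerance logic in the matrix_rank_tol kernel above reduces, per matrix in the batch, to counting the singular values (absolute eigenvalues in the hermitian case) that exceed max(atol, rtol * sigma_max). A minimal host-side sketch of that rule; the helper name and the use of std::vector are illustrative stand-ins:

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative host-side reference: sigma_i counts toward the rank when
// sigma_i > max(atol, rtol * sigma_max). The kernel above performs the same
// comparison with ElementwiseCompute and reduces it with SumKernel.
int64_t RankFromSingularValues(const std::vector<double>& sigma,
                               double atol,
                               double rtol) {
  if (sigma.empty()) return 0;
  const double sigma_max = *std::max_element(sigma.begin(), sigma.end());
  const double tol = std::max(atol, rtol * sigma_max);
  int64_t rank = 0;
  for (double s : sigma) {
    if (s > tol) ++rank;
  }
  return rank;
}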
+ +#include "paddle/phi/kernels/one_hot_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillOutputKernel(const InT* p_in_data, + OutT* p_out_data, + const int64_t numel, + const int depth) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { + *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; + } +} + +template +struct OneHotV2OpCUDAFunctor { + const DenseTensor* in_; + DenseTensor* out_; + const DeviceContext& ctx_; + int depth_; + + OneHotV2OpCUDAFunctor(const DenseTensor* in, + DenseTensor* out, + int depth, + const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void apply() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = ctx_.template Alloc(out_); + auto stream = ctx_.stream(); + funcs::set_constant(ctx_, out_, 0.0); + + FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(p_in_data, p_out_data, numel, depth_); + } +}; + +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out) { + auto out_dims = out->dims(); + if (out_dims[out_dims.size() - 1] == -1) { + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + + phi::VisitDataType( + dtype, OneHotV2OpCUDAFunctor(&x, out, depth, dev_ctx)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + one_hot_raw, GPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5ab6a1ccd49f2a88835bf1dd63c2d874db4e2a7 --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d_grad, + GPU, + ALL_LAYOUT, + phi::Pool3dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e8641395bef927b7e8f7c9ba522af84c0b34680e --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, + GPU, + ALL_LAYOUT, + phi::Pool2dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, + GPU, + ALL_LAYOUT, + phi::Pool3dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu index 14084d0f4f3c6fbd4edeb335e15704ce2b4e6e15..278d4a6e5ab79a7519e1052a2d05c6ecda62692f 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" -#include "paddle/phi/kernels/reduce_prod_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2f906fa4f663b6da65a3e986af2214dfb49f2ec0 --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -0,0 +1,255 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__device__ T BilinearInterpolate( + const T* input_data, const int height, const int width, T y, T x) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return 0; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPUROIAlignForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, + T* output_data, + const bool continuous_coordinate) { + CUDA_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_offset = continuous_coordinate ? 
static_cast(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + output_val /= count; + output_data[i] = output_val; + } +} + +template +void ROIAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; +#ifdef WITH_NV_JETSON + backends::gpu::ChangeThreadNum(dev_ctx, &threads, 256); +#endif + DenseTensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + int* roi_batch_id_data = dev_ctx.template HostAlloc(&roi_batch_id_list); + auto cplace = phi::CPUPlace(); + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The boxes_batch_size and imgs " + "batch_size must be the same. But received boxes_batch_size = %d, " + "batch_size = %d", + boxes_batch_size, + batch_size)); + + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(cplace, + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + roi_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto lod = boxes.lod(); + PADDLE_ENFORCE_EQ(lod.empty(), + false, + errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " + "not contain LoD information.")); + auto boxes_lod = lod.back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and batch size " + "of images must be the same. 
But received rois batch size = %d, " + "and images batch size = %d", + boxes_batch_size, + batch_size)); + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + boxes_num_with_lod, + errors::InvalidArgument( + "The actual number of rois and the number of rois " + "provided from Input(RoIsLoD) in RoIAlign must be the same." + " But received actual number of rois is %d, and the number " + "of rois from RoIsLoD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + } + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + paddle::memory::Copy( + gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); + GPUROIAlignForward<<>>( + output_size, + x.data(), + boxes.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + roi_id_data, + dev_ctx.template Alloc(out), + aligned); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 930c50a24be8fae40535c2d5e6dbbe85e7ced990..6f96a697b2f2db6c2097640f34c30142939f80e0 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -15,10 +15,9 @@ limitations under the License. */ #include "paddle/phi/kernels/scale_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/phi/common/float16.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/searchsorted_kernel.cu b/paddle/phi/kernels/gpu/searchsorted_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a2ce2241c22dc5c1cab391fe24a502ba845802b --- /dev/null +++ b/paddle/phi/kernels/gpu/searchsorted_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/searchsorted_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h" + +PD_REGISTER_KERNEL(searchsorted, + GPU, + ALL_LAYOUT, + phi::SearchsortedKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7eed96699e720870577c3d5246ce07c12c37335c --- /dev/null +++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/set_value_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value_grad, + GPU, + ALL_LAYOUT, + phi::SetValueGradKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu index 03c5714b967841ef1bd124bd9191830a79567514..4a02f438c7e7e4f0c0212ee613ce78a7fac20909 100644 --- a/paddle/phi/kernels/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -23,7 +23,7 @@ limitations under the License. */ PD_REGISTER_KERNEL(softmax, GPU, ALL_LAYOUT, - phi::SoftmaxRawKernel, + phi::SoftmaxKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index c28fc3794f092a4cee8d7fc351190c13291892b1..83c2ec4b6e99d675bfbcab58abd265cc8595259c 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -37,7 +37,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index 4e9aa88c6cb2da7fabe3f5d841a313e82b9ebed2..7f06af7de43f7ee234831203c485eaa0b8c86cbf 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -78,15 +78,16 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - if (ops::SortTopk( - paddle::platform::CUDADeviceContext(dev_ctx.GetPlace()), - input, - input_width, - input_height, - k, - out, - indices, - largest)) { + auto* ctx = reinterpret_cast( + &dev_ctx); + if (ops::SortTopk(*ctx, + input, + input_width, + input_height, + k, + out, + indices, + largest)) { // Successed, return. 
return; } else { @@ -181,15 +182,16 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - if (ops::SortTopk( - paddle::platform::CUDADeviceContext(dev_ctx.GetPlace()), - &trans_input, - input_width, - input_height, - k, - &trans_out, - &trans_ind, - largest)) { + auto* ctx = reinterpret_cast( + &dev_ctx); + if (ops::SortTopk(*ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind, + largest)) { // last step, tranpose back the indices and output funcs::TransCompute( ndims, dev_ctx, trans_ind, indices, trans); diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index f27b32ca7b8319440b62f0d03d21129133c8470c..bb04e7ee8515bb6320860e4fd20366995d26c991 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -33,23 +33,27 @@ struct GPUTruncatedNormal { T mean, std; T a_normal_cdf; T b_normal_cdf; + unsigned int seed; T numeric_min; __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf((-2.0 - mean) / std); + b_normal_cdf = normal_cdf((2.0 - mean) / std); } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); + thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); rng.discard(n); T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + return std::sqrt(2.0) * erfinvf(value) * std + mean; } }; @@ -69,18 +73,21 @@ struct TruncatedNormalOffset { seed(seed), numeric_min(numeric_min), offset_(offset) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf((-2.0 - mean) / std); + b_normal_cdf = normal_cdf((2.0 - mean) / std); } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); + thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); rng.discard(n + offset_); T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + return std::sqrt(2.0) * erfinvf(value) * std + mean; } }; diff --git a/paddle/phi/kernels/gpudnn/pool_gpudnn.h b/paddle/phi/kernels/gpudnn/pool_gpudnn.h new file mode 100644 index 0000000000000000000000000000000000000000..0cf2c991464fc6e091eee0bc75641d7abae8598c --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_gpudnn.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
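A note on the truncated_gaussian_random change above: the updated functors are the standard inverse-CDF sampler for a normal with parameters (mean, std) truncated to [-2, 2]. With Phi the standard normal CDF, draw u uniformly from (Phi((-2 - mean) / std), Phi((2 - mean) / std)) and map it through x = mean + std * sqrt(2) * erfinv(2u - 1); the new code samples 2u - 1 directly from (2 * a_normal_cdf - 1, 2 * b_normal_cdf - 1) before applying erfinvf.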
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +using GPUDNNDataLayout = paddle::platform::DataLayout; +using PoolingMode = paddle::platform::PoolingMode; +using ScopedPoolingDescriptor = paddle::platform::ScopedPoolingDescriptor; +using ScopedTensorDescriptor = paddle::platform::ScopedTensorDescriptor; + +template +using ScalingParamType = + typename paddle::platform::CudnnDataType::ScalingParamType; + +inline GPUDNNDataLayout GetLayoutFromStr(std::string data_format) { + if (data_format == "NHWC") { + return GPUDNNDataLayout::kNHWC; + } else if (data_format == "NCHW") { + return GPUDNNDataLayout::kNCHW; + } else if (data_format == "NCDHW") { + return GPUDNNDataLayout::kNCDHW; + } else { + return GPUDNNDataLayout::kNCDHW; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..b731d03347024ccd76eafc02c7096f3633948eb5 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu @@ -0,0 +1,448 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" // PoolGradRawGPUDNNKernel will call PoolGradRawKernel for pooling type "max" in ROCm +#endif + +namespace phi { + +template +void PoolGradRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + const DenseTensor* output = &out; + const DenseTensor* output_grad = &dout; + DenseTensor* input_grad = dx; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + +#ifdef PADDLE_WITH_HIP + if (pooling_type == "max") { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings_, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); + return; + } +#endif + + // update paddings + auto in_x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + // ------- tensor grad -------------- + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + DenseTensor transformed_output_grad(output_grad->type()); + + ctx.template Alloc(input_grad); + DenseTensor transformed_input_grad(input_grad->type()); + GPUDNNDataLayout layout; + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + 
out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); + + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans5_v2; + trans5_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans5_v3; + trans5_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); + +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans4; + trans4(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans4_v2; + trans4_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans4_v3; + trans4_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + transformed_output_grad = *output_grad; + transformed_input_grad = *input_grad; + } + + const T* input_data = transformed_input.data(); + const T* output_data = transformed_output.data(); + const T* output_grad_data = transformed_output_grad.data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + if (FLAGS_cudnn_deterministic) { + pooling_mode = PoolingMode::kMaximumDeterministic; + } else { + pooling_mode = PoolingMode::kMaximum; + } + } else { + pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = ctx.template Alloc(&transformed_input_grad); +// Because beta is zero, it is unnecessary to reset input_grad. +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_worksize = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data, + pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data)); +#endif + + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v4; + trans5_v4(ctx, transformed_input_grad, input_grad, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans4_v4; + trans4_v4(ctx, transformed_input_grad, input_grad, axis); + } +#endif + } +} + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawGPUDNNKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + 
const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawGPUDNNKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +} // namespace phi + +using phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(pool2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dDoubleGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(pool3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGradGPUDNNKernel, + float, + float16) {} +#else +PD_REGISTER_KERNEL(pool2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dDoubleGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGradGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d8f965667758b9118635e3c8db4be74f9ff54a6a --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu @@ -0,0 +1,312 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +namespace phi { + +template +void PoolRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + DenseTensor* output = out; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + ctx.template Alloc(output); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings_ + auto x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + + // -----------------transformed tensor ------------------------ + + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + GPUDNNDataLayout layout; + + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans; + trans(ctx, *input, &transformed_input, axis); + + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + 
out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + } + + const T* tranformed_input_data = transformed_input.data(); + T* tranformed_output_data = ctx.template Alloc(&transformed_output); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = exclusive ? PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_workernel_size_ = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data, + false, + pool_workspace, + pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cudnnPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data)); +#endif + // add + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v2; + trans5_v2(ctx, transformed_output, output, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans; + trans(ctx, transformed_output, output, axis); + } +#endif +} + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void 
Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +} // namespace phi + +using phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + pool2d, GPUDNN, ALL_LAYOUT, phi::Pool2dGPUDNNKernel, float, float16) {} +PD_REGISTER_KERNEL( + pool3d, GPUDNN, ALL_LAYOUT, phi::Pool3dGPUDNNKernel, float, float16) {} +#else +PD_REGISTER_KERNEL(pool2d, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool3d, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 0352fdf6fa2f1c1b74515d8e0023ef5a58e4efae..2b2dd5118969cf35c4762f3ab774ce41c04d2e4d 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -351,8 +351,17 @@ __global__ void WarpSoftmaxForward(T* softmax, VecT* softmax_v = reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); - kps::ElementwiseUnary>( - &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + if (LogMode) { + kps::ElementwiseUnary>( + &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor()); + kps::ElementwiseUnary>( + &out_tmp[i][0][0], + &srcdata[i][0][0], + UnarySubFunctor(std::log(sum[i]))); + } else { + kps::ElementwiseUnary>( + &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + } kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); } @@ -434,15 +443,25 @@ __global__ void WarpSoftmaxBackward(T* dst, AccT sum_tmp[kBatchSize][kLoopsV][kVSize]; AccT* gradptr = reinterpret_cast(&grad_tmp[0][0][0]); AccT* srcptr = reinterpret_cast(&src_tmp[0][0][0]); - kps::ElementwiseBinary>( - &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); - kps::Reduce, - kps::details::ReduceMode::kLocalMode>( - &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); + if (LogMode) { + kps::Reduce, + kps::details::ReduceMode::kLocalMode>( + &sum[0], &grad_tmp[0][0][0], kps::AddFunctor(), true); + } else { + kps::ElementwiseBinary>( + &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); + kps::Reduce, + kps::details::ReduceMode::kLocalMode>( + &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); + } WarpReduceSum(sum); // write result to global memory @@ -453,10 +472,23 @@ __global__ void WarpSoftmaxBackward(T* dst, if (i >= local_batches) break; AccT* gradptr = reinterpret_cast(&grad_tmp[i][0][0]); AccT* srcptr = reinterpret_cast(&src_tmp[i][0][0]); - kps::ElementwiseUnary>( - &out[i][0][0], &gradptr[0], UnarySubFunctor(sum[i])); - kps::ElementwiseBinary>( - &out_tmp[i][0][0], &srcptr[0], &out[i][0][0], kps::MulFunctor()); + if (LogMode) { + kps::ElementwiseUnary>( + &out[i][0][0], &srcptr[0], ExpMulFunctor(sum[i])); + kps::ElementwiseBinary>( + &out_tmp[i][0][0], + &gradptr[0], + &out[i][0][0], + kps::SubFunctor()); + } else { + kps::ElementwiseUnary>( + &out[i][0][0], &gradptr[0], UnarySubFunctor(sum[i])); + 
kps::ElementwiseBinary>( + &out_tmp[i][0][0], + &srcptr[0], + &out[i][0][0], + kps::MulFunctor()); + } VecT* dst_v = reinterpret_cast(&dst[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); kps::WriteData( @@ -639,7 +671,8 @@ __global__ void NormalSoftmaxForward( template class Functor> + template class Functor, + bool LogMode> __global__ void NormalSoftmaxBackward(T* input_grad, const T* output_grad, const T* output, @@ -656,10 +689,17 @@ __global__ void NormalSoftmaxBackward(T* input_grad, // 1. reduce sum AccT sum = 0; - for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { - int data_offset = grad_offset + mid_id * mid_stride; - sum += static_cast(output_grad[data_offset]) * - static_cast(output[data_offset]); + if (LogMode) { + for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { + int data_offset = grad_offset + mid_id * mid_stride; + sum += static_cast(output_grad[data_offset]); + } + } else { + for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { + int data_offset = grad_offset + mid_id * mid_stride; + sum += static_cast(output_grad[data_offset]) * + static_cast(output[data_offset]); + } } if (blockDim.y > 1) { kps::Reduce, kMode::kGlobalMode>( @@ -715,10 +755,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, dim3 grid, block; GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block); if (LogMode) { - NormalSoftmaxBackward< - T, - AccT, - LogSoftmaxBackwardFunctor><<>>( + NormalSoftmaxBackward<<>>( input_grad_data, output_grad_data, output_data, @@ -726,10 +766,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, mid_dim, low_dim); } else { - NormalSoftmaxBackward< - T, - AccT, - SoftmaxBackwardFunctor><<>>( + NormalSoftmaxBackward<<>>( input_grad_data, output_grad_data, output_data, @@ -864,6 +904,32 @@ static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) { return false; } +#if CUDNN_VERSION < 8100 +template <> +inline void SoftmaxForwardCudnnKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const int axis, + const bool log_mode, + DenseTensor* out) { + PADDLE_THROW(errors::Unavailable( + "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < " + "8100.")); +} +template <> +inline void SoftmaxBackwardCudnnKernel( + const GPUContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + const int axis, + const bool log_mode, + DenseTensor* dx) { + PADDLE_THROW(errors::Unavailable( + "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < " + "8100.")); +} +#endif + template void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu index 7685c7dbb6894b4e640ea4b63010c4d22fc5e18f..37175c427ffe142c31b41c8356d160d203fd6d73 100644 --- a/paddle/phi/kernels/gpudnn/softmax_kernel.cu +++ b/paddle/phi/kernels/gpudnn/softmax_kernel.cu @@ -21,10 +21,10 @@ limitations under the License. 
*/ namespace phi { template -void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out) { +void SoftmaxGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { dev_ctx.template Alloc(out); SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); } @@ -35,7 +35,7 @@ void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, phi::dtype::float16, phi::dtype::bfloat16) {} @@ -44,7 +44,7 @@ PD_REGISTER_KERNEL(softmax, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, double, phi::dtype::float16, @@ -53,7 +53,7 @@ PD_REGISTER_KERNEL(softmax, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 80e23d2b8e24b875fcc03bc0c1c149c0c13e3e41..a48a6226f23f8d9976dc86e59b051828b1d71b21 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -130,4 +130,76 @@ void ReluDoubleGradKernel(const Context& dev_ctx, relu_double_grad_functor); } +template +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout) { + funcs::LeakyReluGradGradFunctor leaky_relu_double_grad_functor; + leaky_relu_double_grad_functor.alpha = alpha; + ActivationDoubleGradImpl>( + dev_ctx, + &x, + nullptr, + &ddx, + nullptr, + nullptr, + ddout, + leaky_relu_double_grad_functor); +} + +template +void TanhDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::TanhGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void TanhTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::TanhTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, // input + d_dout, + d_out_new, + d_ddx); // output +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 2f0530b638f5ea3d263f7c2b1a932a65ccaf3da2..5b71fd7fa3a5ecd1c864c155df2586d293d3d2e6 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -15,21 +15,15 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" -#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include 
"paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - namespace phi { template diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 65427e87506f70549c81acec714ce2f5ebdfc9b8..0b7a5d3bcb26a360eb5f7f664ead7932f428cc64 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -258,6 +258,102 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dout_result.device(place) = static_cast(-1) * dout_result; } } +template +void ElementwiseFMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad) { + funcs::ElementwiseGradPreProcess(out_grad, x_grad); + + auto out = out_grad; // Fake out, not used + auto x_dim = x.dims(); + auto y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast, + funcs::FMaxGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMaxGradDx(), + funcs::FMaxGradDy()); + } else { + funcs::ElemwiseGradComputeWithBroadcast, + funcs::FMaxGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMaxGradDx(), + funcs::FMaxGradDy()); + } +} + +template +void ElementwiseFMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad) { + funcs::ElementwiseGradPreProcess(out_grad, x_grad); + auto out = out_grad; // Fake out, not used + auto x_dim = x.dims(); + auto y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast, + funcs::FMinGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMinGradDx(), + funcs::FMinGradDy()); + } else { + funcs::ElemwiseGradComputeWithBroadcast, + funcs::FMinGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMinGradDx(), + funcs::FMinGradDy()); + } +} template struct MulGradDX { diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..775a91bf026d298a61315a7e2d7ebfbe92efb0b5 --- /dev/null +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#endif + +namespace phi { +template +void ElementwiseFMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T, T>( + dev_ctx, x, y, axis, funcs::FMaxFunctor(), out); +} + +template +void ElementwiseFMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T, T>( + dev_ctx, x, y, axis, funcs::FMinFunctor(), out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..30297b53eabb99c4fcccc5c3c7faa04f86d4bb93 --- /dev/null +++ b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h @@ -0,0 +1,295 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
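ElementwiseFMaxKernel and ElementwiseFMinKernel above broadcast the two inputs and apply funcs::FMaxFunctor / funcs::FMinFunctor elementwise, while the grad kernels in the previous hunk route dout to whichever operand the forward op selected. A standalone sketch of that behaviour under the usual fmax convention (NaN-aware max, gradient follows the selected operand, ties attributed to x); the helper names are illustrative and the functors in the diff remain the authoritative definition:

#include <cmath>

// Forward: fmax ignores a NaN operand, e.g. fmax(NaN, y) == y.
double FMaxRef(double x, double y) { return std::fmax(x, y); }

// Backward: dout flows to x where x "wins" (x >= y, or y is NaN),
// otherwise to y. This mirrors the intent of FMaxGradDx / FMaxGradDy.
void FMaxGradRef(double x, double y, double dout, double* dx, double* dy) {
  const bool take_x = (x >= y) || std::isnan(y);
  *dx = take_x ? dout : 0.0;
  *dy = take_x ? 0.0 : dout;
}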
+ +#pragma once + +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +namespace phi { + +template +struct KronGradElemFunctor { + KronGradElemFunctor(const T* dout, + const T* A, + const T* B, + T* dout_a, + T* dout_b, + const int64_t* stride_dout, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* shape_b, + const int64_t numel_a, + const int64_t numel_b, + const int ndims) + : dout_(dout), + A_(A), + B_(B), + dout_a_(dout_a), + dout_b_(dout_b), + stride_dout_(stride_dout), + stride_a_(stride_a), + stride_b_(stride_b), + shape_b_(shape_b), + numel_a_(numel_a), + numel_b_(numel_b), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) { + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_dout_[i]; + index = index % stride_dout_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + + if (dout_a_) { + size_t index_out_a = index_a * numel_b_ + index_b; + dout_a_[index_out_a] = dout_[idx] * B_[index_b]; + } + if (dout_b_) { + size_t index_out_b = index_b * numel_a_ + index_a; + dout_b_[index_out_b] = dout_[idx] * A_[index_a]; + } + } + + private: + const T* dout_; + const T* A_; + const T* B_; + T* dout_a_; + T* dout_b_; + const int64_t* stride_dout_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* shape_b_; + const int64_t numel_a_; + const int64_t numel_b_; + const int ndims_; +}; + +template +struct KronGradElemFunctor> { + KronGradElemFunctor(const dtype::complex* dout, + const dtype::complex* A, + const dtype::complex* B, + dtype::complex* dout_a, + dtype::complex* dout_b, + const int64_t* stride_dout, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* shape_b, + const int64_t numel_a, + const int64_t numel_b, + const int ndims) + : dout_(dout), + A_(A), + B_(B), + dout_a_(dout_a), + dout_b_(dout_b), + stride_dout_(stride_dout), + stride_a_(stride_a), + stride_b_(stride_b), + shape_b_(shape_b), + numel_a_(numel_a), + numel_b_(numel_b), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) { + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_dout_[i]; + index = index % stride_dout_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + + if (dout_a_) { + size_t index_out_a = index_a * numel_b_ + index_b; + dout_a_[index_out_a] = + dout_[idx] * dtype::complex(B_[index_b].real, -B_[index_b].imag); + } + if (dout_b_) { + size_t index_out_b = index_b * numel_a_ + index_a; + dout_b_[index_out_b] = + dout_[idx] * dtype::complex(A_[index_a].real, -A_[index_a].imag); + } + } + + private: + const dtype::complex* dout_; + const dtype::complex* A_; + const dtype::complex* B_; + dtype::complex* dout_a_; + dtype::complex* dout_b_; + const int64_t* stride_dout_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* shape_b_; + const int64_t numel_a_; + const int64_t numel_b_; + const int ndims_; +}; + +template +struct KronGradOpFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* dx, + DenseTensor* dy) { + int ndims = dout.dims().size(); + int64_t numel = dout.numel(); + int64_t numel_x = x.numel(); + int64_t numel_y = y.numel(); + + const 
phi::DDim& dim_x = x.dims(); + const phi::DDim& dim_y = y.dims(); + const phi::DDim& dim_dout = dout.dims(); + + const phi::DDim stride_x = phi::stride(dim_x); + const phi::DDim stride_y = phi::stride(dim_y); + const phi::DDim stride_dout = phi::stride(dim_dout); + + const int64_t* p_stride_x = nullptr; + const int64_t* p_stride_y = nullptr; + const int64_t* p_stride_dout = nullptr; + const int64_t* p_shape_y = nullptr; +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector d_stride_x(ndims); + thrust::device_vector d_stride_y(ndims); + thrust::device_vector d_stride_dout(ndims); + thrust::device_vector d_shape_y(ndims); + thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); + thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); + thrust::copy( + stride_dout.Get(), stride_dout.Get() + ndims, d_stride_dout.begin()); + thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); + + p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); + p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); + p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); + p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); +#else + p_stride_x = stride_x.Get(); + p_stride_y = stride_y.Get(); + p_stride_dout = stride_dout.Get(); + p_shape_y = dim_y.Get(); +#endif + // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) + // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) + DenseTensor dout_x; + T* p_dout_x = nullptr; + if (dx) { + dout_x.Resize({numel_x, numel_y}); + dev_ctx.template Alloc(&dout_x); + p_dout_x = dout_x.data(); + } + DenseTensor dout_y; + T* p_dout_y = nullptr; + if (dy) { + dout_y.Resize({numel_y, numel_x}); + dev_ctx.template Alloc(&dout_y); + p_dout_y = dout_y.data(); + } + + funcs::ForRange for_range(dev_ctx, numel); + KronGradElemFunctor func(dout.data(), + x.data(), + y.data(), + p_dout_x, + p_dout_y, + p_stride_dout, + p_stride_x, + p_stride_y, + p_shape_y, + numel_x, + numel_y, + ndims); + for_range(func); + +// reduce_sum along aixs 1 +#if defined(__NVCC__) || defined(__HIPCC__) + auto stream = dev_ctx.stream(); // it is a cuda device_context + if (dx) { + funcs::ReduceKernel>( + dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}); + } + if (dy) { + funcs::ReduceKernel>( + dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}); + } +#else + auto* place = dev_ctx.eigen_device(); + Eigen::array reduce_dim = {1}; + if (dx) { + auto eigen_dout_x = EigenMatrix::Reshape(dout_x, 1); + auto eigen_vec_dx = EigenVector::Flatten(*dx); + eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); + } + if (dy) { + auto eigen_dout_y = EigenMatrix::Reshape(dout_y, 1); + auto eigen_vec_dy = EigenVector::Flatten(*dy); + eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); + } +#endif + } +}; + +template +void KronGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + if (x_grad) { + ctx.template Alloc(x_grad); + } + if (y_grad) { + ctx.template Alloc(y_grad); + } + + int ndims = out_grad.dims().size(); + DenseTensor xx = UnsqueezeTo(x, ndims); + DenseTensor yy = UnsqueezeTo(y, ndims); + + DenseTensor* pdxx = nullptr; + DenseTensor* pdyy = nullptr; + DenseTensor dxx; + DenseTensor dyy; + if (x_grad) { + dxx = UnsqueezeTo(*x_grad, ndims); + pdxx = &dxx; + } + + if (y_grad) { + dyy = UnsqueezeTo(*y_grad, ndims); + pdyy = &dyy; + } + + KronGradOpFunctor func; + func(ctx, out_grad, xx, 
yy, pdxx, pdyy); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/kron_kernel_impl.h b/paddle/phi/kernels/impl/kron_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..47c76f59df23bfee68a2660b76a09df747048378 --- /dev/null +++ b/paddle/phi/kernels/impl/kron_kernel_impl.h @@ -0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "thrust/device_vector.h" +#endif + +namespace phi { + +inline DenseTensor UnsqueezeTo(const DenseTensor& src, int ndims) { + const phi::DDim& shape = src.dims(); + int rank = shape.size(); + DenseTensor res; + res.ShareDataWith(src); + PADDLE_ENFORCE_LE( + rank, + ndims, + errors::InvalidArgument( + "The input Tensor's rank should be less than or equal to ndims" + "Received input Tensor's rank = %d, ndims = %d", + rank, + ndims)); + if (rank < ndims) { + std::vector new_dim(ndims, 1); + for (int i = ndims - rank; i < ndims; i++) { + new_dim[i] = shape[i - ndims + rank]; + } + res.Resize(phi::make_ddim(new_dim)); + } + return res; +} + +template +struct KronElemFunctor { + KronElemFunctor(const T* a, + const T* b, + T* out, + const int64_t* shape_b, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* stride_out, + int ndims) + : a_(a), + b_(b), + out_(out), + shape_b_(shape_b), + stride_a_(stride_a), + stride_b_(stride_b), + stride_out_(stride_out), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) const { + // it computes 1 element in the output + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_out_[i]; + index = index % stride_out_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + out_[idx] = a_[index_a] * b_[index_b]; + } + + private: + const T* a_; + const T* b_; + T* out_; + const int64_t* shape_b_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* stride_out_; + const int ndims_; +}; + +template +struct KronOpFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int ndims = out->dims().size(); + int64_t numel = out->numel(); + + const phi::DDim& dim_x = x.dims(); + const phi::DDim& dim_y = y.dims(); + const phi::DDim& dim_out = out->dims(); + const phi::DDim stride_x = phi::stride(dim_x); + const phi::DDim stride_y = phi::stride(dim_y); + const phi::DDim stride_out = phi::stride(dim_out); + + const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr, + *p_stride_out = nullptr, *p_shape_y = nullptr; +#if defined(__NVCC__) 
|| defined(__HIPCC__) + thrust::device_vector d_stride_x(ndims); + thrust::device_vector d_stride_y(ndims); + thrust::device_vector d_stride_out(ndims); + thrust::device_vector d_shape_y(ndims); + thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); + thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); + thrust::copy( + stride_out.Get(), stride_out.Get() + ndims, d_stride_out.begin()); + thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); + + p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); + p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); + p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); + p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); +#else + p_stride_x = stride_x.Get(); + p_stride_y = stride_y.Get(); + p_stride_out = stride_out.Get(); + p_shape_y = dim_y.Get(); +#endif + + funcs::ForRange for_range(dev_ctx, numel); + KronElemFunctor functor(x.data(), + y.data(), + out->data(), + p_shape_y, + p_stride_x, + p_stride_y, + p_stride_out, + ndims); + for_range(functor); + } +}; + +template +void KronKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + ctx.template Alloc(out); + + int ndims = out->dims().size(); + DenseTensor xx = UnsqueezeTo(x, ndims); + DenseTensor yy = UnsqueezeTo(y, ndims); + + KronOpFunctor func; + func(ctx, xx, yy, out); +} + +} // namespace phi diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h similarity index 72% rename from paddle/fluid/operators/matrix_rank_op.h rename to paddle/phi/kernels/impl/matrix_rank_kernel_impl.h index 93545fd31037ada823d35af5b5bad809ebf3d773..b0dd76a17eeb363d53f29ba3e6cb3e5bf209edfc 100644 --- a/paddle/fluid/operators/matrix_rank_op.h +++ b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,14 +13,11 @@ // limitations under the License. 
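KronElemFunctor above computes one output element at a time: it decomposes the flat output index with the output strides, then splits each per-dimension position into pos / shape_b (index into x) and pos % shape_b (index into y). In the plain 2-D case that mapping reduces to the familiar Kronecker-product formula, shown here as a standalone row-major sketch with no Paddle dependencies (the function name is illustrative):

#include <vector>

// out has shape (ra*rb, ca*cb); out[i, j] = a[i / rb, j / cb] * b[i % rb, j % cb].
std::vector<double> Kron2DRef(const std::vector<double>& a, int ra, int ca,
                              const std::vector<double>& b, int rb, int cb) {
  std::vector<double> out(static_cast<size_t>(ra) * rb * ca * cb, 0.0);
  const int out_cols = ca * cb;
  for (int i = 0; i < ra * rb; ++i) {
    for (int j = 0; j < out_cols; ++j) {
      out[static_cast<size_t>(i) * out_cols + j] =
          a[(i / rb) * ca + (j / cb)] * b[(i % rb) * cb + (j % cb)];
    }
  }
  return out;
}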
#pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/phi/core/ddim.h" -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/matrix_rank_kernel.h" + +namespace phi { namespace detail { static DDim GetEigenvalueDim(const DDim& dim, int k) { @@ -44,6 +41,18 @@ static DDim RemoveLastDim(const DDim& dim) { vec.erase(vec.end() - 1, vec.end()); return phi::make_ddim(vec); } + +static DDim GetUDDim(const DDim& x_dim, int k) { + auto x_vec = phi::vectorize(x_dim); + x_vec[x_vec.size() - 1] = k; + return phi::make_ddim(x_vec); +} + +static DDim GetVHDDim(const DDim& x_dim, int k) { + auto x_vec = phi::vectorize(x_dim); + x_vec[x_vec.size() - 2] = k; + return phi::make_ddim(x_vec); +} } // namespace detail template @@ -57,5 +66,4 @@ struct GreaterElementFunctor { } }; -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..7fe89ce34c8b5a33df12c1931caeddb37de5aea2 --- /dev/null +++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h @@ -0,0 +1,332 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +namespace phi { + +template +void PoolGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + if (dx) { + ctx.template Alloc(dx); + funcs::SetConstant set_constant; + set_constant(ctx, dx, static_cast(0.0)); + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool2dGradFunctor, T> + pool2d_backward; + funcs::AvgPoolGrad pool_process; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::MaxPool3dGradFunctor pool3d_backward; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool3dGradFunctor, T> + pool3d_backward; + funcs::AvgPoolGrad pool_process; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void MaxPoolWithIndexGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(dx->dims()[i + 2]); + } + } + + if (dx) { + ctx.template Alloc(dx); + funcs::set_constant(ctx, dx, 0); + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexGradFunctor pool2d_backward; + pool2d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx); + } break; + case 3: { + funcs::MaxPool3dWithIndexGradFunctor pool3d_backward; + 
pool3d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..665d02fd0173e0b4dec7de7bfbf89cfa13d92f3f --- /dev/null +++ b/paddle/phi/kernels/impl/pool_kernel_impl.h @@ -0,0 +1,321 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/pool_kernel.h" + +#include +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +#if defined(__HIPCC__) || defined(__NVCC__) +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" +#endif + +namespace phi { + +inline int GetReduceNum(const DenseTensor& input, + const DenseTensor* output, + const std::string data_format, + std::vector* reduce_dim) { + // data_format only can be NCHW + bool channel_last = (data_format == "NHWC"); + if (channel_last) { + return 0; + } + int reduce_num = 0; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + if ((output_height == 1) && (output_width == 1)) { + reduce_dim->push_back(2); + reduce_dim->push_back(3); + reduce_num = input.dims()[2] * input.dims()[3]; + } + return reduce_num; +} + +template +void PoolRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::MaxPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + + } else if (pooling_type == "avg") { + std::vector reduce_dim; + int reduce_num = GetReduceNum(x, out, data_format, &reduce_dim); + if (reduce_num > 0 && + adaptive) { // for adaptive_avg_pool2d && output_size == 1 +#if defined(__HIPCC__) || defined(__NVCC__) + auto stream = ctx.stream(); + funcs::ReduceKernel>( + ctx, x, out, kps::DivideFunctor(reduce_num), reduce_dim); +#else // for cpu + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); +#endif + } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + 
adaptive, + out, + pool_process); + } + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::MaxPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + } else if (pooling_type == "avg") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::AvgPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void MaxPoolWithIndexRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x.dims()[i + 2]); + } + } + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexFunctor pool2d_forward; + pool2d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + case 3: { + funcs::MaxPool3dWithIndexFunctor pool3d_forward; + pool3d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +} // namespace phi diff --git 
a/paddle/fluid/operators/searchsorted_op.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h similarity index 58% rename from paddle/fluid/operators/searchsorted_op.h rename to paddle/phi/kernels/impl/searchsorted_kernel_impl.h index 6aa38a815813230c2921f3d3816881966df6bf98..82bd9fba2a66d7a4601b5aab360b9bbf80ff04d9 100644 --- a/paddle/fluid/operators/searchsorted_op.h +++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,16 +16,11 @@ #include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/algorithm.h" +#include "paddle/phi/kernels/funcs/for_range.h" -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; +namespace phi { template class GpuAndCpuSearchSortedCompute { @@ -65,9 +60,11 @@ class GpuAndCpuSearchSortedCompute { static HOSTDEVICE bool IsInf(int64_t x) { return false; } HOSTDEVICE GpuAndCpuSearchSortedCompute(const T1* sequence_data, - const T2* value_data, bool right, + const T2* value_data, + bool right, bool is_1d_boundaries, - int64_t val_size, int64_t seq_size, + int64_t val_size, + int64_t seq_size, OutType* out_data) : sequence_data_(sequence_data), value_data_(value_data), @@ -104,12 +101,13 @@ class GpuAndCpuSearchSortedCompute { OutType* out_data_; }; -template +template class SearchSortedFunctor { public: - SearchSortedFunctor(const framework::ExecutionContext& context, - const framework::Tensor* sorted_sequence, - const framework::Tensor* value, bool right, + SearchSortedFunctor(const Context& context, + const DenseTensor* sorted_sequence, + const DenseTensor* value, + bool right, OutType* out_data) : context_(context), sorted_sequence_(sorted_sequence), @@ -121,74 +119,73 @@ class SearchSortedFunctor { void apply() { const T1* sequence_data = sorted_sequence_->data(); const T2* value_data = value_->data(); - const framework::DDim& seq_dims = sorted_sequence_->dims(); - const framework::DDim& val_dims = value_->dims(); + const phi::DDim& seq_dims = sorted_sequence_->dims(); + const phi::DDim& val_dims = value_->dims(); bool is_1d_boundaries = seq_dims.size() == 1; int64_t val_size = val_dims[val_dims.size() - 1]; int64_t seq_size = seq_dims[seq_dims.size() - 1]; - auto& dev_ctx = context_.template device_context(); - platform::ForRange for_range(dev_ctx, value_->numel()); + funcs::ForRange for_range(context_, value_->numel()); GpuAndCpuSearchSortedCompute - gpu_and_cpu_search_sorted_compute(sequence_data, value_data, right_, - is_1d_boundaries, val_size, seq_size, + gpu_and_cpu_search_sorted_compute(sequence_data, + value_data, + right_, + is_1d_boundaries, + val_size, + seq_size, out_data_); for_range(gpu_and_cpu_search_sorted_compute); } private: - const framework::ExecutionContext& context_; - const framework::Tensor* sorted_sequence_; - const framework::Tensor* value_; + const Context& context_; + const DenseTensor* sorted_sequence_; + const DenseTensor* value_; bool right_; OutType* out_data_; }; template -static void VisitDataType(framework::proto::VarType::Type type, - Visitor visitor) { - if (type == framework::proto::VarType::FP32) { +static 
void VisitDataType(DataType type, Visitor visitor) { + if (type == DataType::FLOAT32) { visitor.template apply(); - } else if (type == framework::proto::VarType::FP64) { + } else if (type == DataType::FLOAT64) { visitor.template apply(); - } else if (type == framework::proto::VarType::INT32) { + } else if (type == DataType::INT32) { visitor.template apply(); - } else if (type == framework::proto::VarType::INT64) { + } else if (type == DataType::INT64) { visitor.template apply(); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(errors::InvalidArgument( "The recieved values data type %s can not meet input requirements. " "Because the given values data type of searchsorted operators must be " "float32, float64, int32 or int64. Please input appropriate " "sorted_sequence again! ", - framework::DataTypeToString(type))); + type)); } } -template -class SearchSortedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* sorted_sequence = context.Input("SortedSequence"); - auto* value = context.Input("Values"); - bool out_int32 = context.Attr("out_int32"); - bool right = context.Attr("right"); - auto* out = context.Output("Out"); - - if (out_int32) { - int* out_data = out->mutable_data(context.GetPlace()); - SearchSortedFunctor functor( - context, sorted_sequence, value, right, out_data); - VisitDataType(framework::TransToProtoVarType(value->dtype()), functor); - } else { - int64_t* out_data = out->mutable_data(context.GetPlace()); - SearchSortedFunctor functor( - context, sorted_sequence, value, right, out_data); - VisitDataType(framework::TransToProtoVarType(value->dtype()), functor); - } +template +void SearchsortedKernel(const Context& ctx, + const DenseTensor& sorted_sequence, + const DenseTensor& value, + bool out_int32, + bool right, + DenseTensor* out) { + if (out_int32) { + ctx.template Alloc(out); + int* out_data = out->data(); + SearchSortedFunctor functor( + ctx, &sorted_sequence, &value, right, out_data); + VisitDataType(value.dtype(), functor); + } else { + ctx.template Alloc(out); + int64_t* out_data = out->data(); + SearchSortedFunctor functor( + ctx, &sorted_sequence, &value, right, out_data); + VisitDataType(value.dtype(), functor); } -}; +} -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..4947170088cba9701ad1065098451b97139bfc95 --- /dev/null +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -0,0 +1,344 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
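SearchsortedKernel above dispatches on the values' dtype and then, for each value, performs a binary search over the last axis of sorted_sequence: right == false returns the first insertion position that keeps the sequence ordered (lower bound), right == true returns the last such position (upper bound). A one-dimensional standalone sketch of that contract using the standard library, for illustration only:

#include <algorithm>
#include <cstdint>
#include <vector>

int64_t SearchSorted1DRef(const std::vector<double>& sorted_seq,
                          double value, bool right) {
  auto it = right
                ? std::upper_bound(sorted_seq.begin(), sorted_seq.end(), value)
                : std::lower_bound(sorted_seq.begin(), sorted_seq.end(), value);
  return static_cast<int64_t>(it - sorted_seq.begin());
}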
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/operators/strided_slice_op.h" + +namespace phi { + +inline void GetOffsets(const DDim& big_dim, + const DDim& small_dim, + DDim start_offset, + int cur_dim, + std::vector* offsets) { + if (cur_dim == big_dim.size()) { + offsets->push_back(start_offset); + return; + } + if (small_dim[cur_dim] == big_dim[cur_dim]) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + } else { + for (int i = 0; i < big_dim[cur_dim]; i++) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + start_offset[cur_dim] += 1; + } + } +} + +template +void SetValueGradImpl(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad) { + PADDLE_ENFORCE_EQ( + out_grad.IsInitialized(), + true, + errors::PermissionDenied( + "The input of `set_value_grad`(out_grad) has not been initialized")); + + auto in_dims = out_grad.dims(); + + std::vector decrease_axis_int32(decrease_axes.begin(), + decrease_axes.end()); + std::vector axes_int32(axes.begin(), axes.end()); + std::vector infer_flags(axes.size(), 1); + std::vector out_dims_vector(in_dims.size(), -1); + std::vector starts_local = starts.GetData(); + std::vector ends_local = ends.GetData(); + std::vector steps_local = steps.GetData(); + paddle::operators::StridedSliceOutDims(starts_local, + ends_local, + steps_local, + axes_int32, + infer_flags, + in_dims, + decrease_axis_int32, + out_dims_vector.data(), + axes.size(), + false); + + DDim out_dims(phi::make_ddim(out_dims_vector)); + + std::vector reverse_vector(starts_local.size(), 0); + paddle::operators::StridedSliceFunctor(starts_local.data(), + ends_local.data(), + steps_local.data(), + axes_int32.data(), + reverse_vector.data(), + in_dims, + infer_flags, + decrease_axis_int32, + starts_local.size()); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto steps_indices = Eigen::DSizes(); + auto reverse_axis = Eigen::array(); + + for (size_t axis = 0; axis < RANK; axis++) { + starts_indices[axis] = 0; + ends_indices[axis] = out_dims[axis]; + steps_indices[axis] = 1; + reverse_axis[axis] = false; + } + + for (size_t axis = 0; axis < axes.size(); axis++) { + int axis_index = axes[axis]; + starts_indices[axis_index] = starts_local[axis]; + ends_indices[axis_index] = ends_local[axis]; + steps_indices[axis_index] = steps_local[axis]; + reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? 
true : false; + } + + bool need_reverse = false; + for (size_t axis = 0; axis < axes.size(); axis++) { + if (reverse_vector[axis] == 1) { + need_reverse = true; + break; + } + } + + auto& place = *dev_ctx.eigen_device(); + phi::funcs::SetConstant set_zero; + + if (x_grad) { + // Set gradient of `Input` + Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + + auto x_grad_t = + EigenTensor::From(*x_grad); + + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + x_grad_t.stridedSlice(starts_indices, ends_indices, steps_indices) + .device(place) = tmp_t; + } + if (value_grad) { + dev_ctx.template Alloc(value_grad); + set_zero(dev_ctx, value_grad, static_cast(0)); + + auto in_t = EigenTensor::From( + out_grad); + + if (value_grad->dims() == out_dims) { + auto value_grad_t = + EigenTensor::From( + *value_grad); + if (need_reverse) { + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + value_grad_t.device(place) = tmp_t.reverse(reverse_axis); + } else { + value_grad_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + } + } else { + int out_dims_size = out_dims.size(); + auto value_grad_dims = value_grad->dims(); + auto fake_value_grad_dims = out_dims; + + // Create an extented shape according to the rules of broadcast. + auto value_grad_dims_size = value_grad_dims.size(); + + int num_decrease = 0; + + int decrease_axis_size = decrease_axes.size(); + for (int i = 0; i < out_dims_size; i++) { + if (decrease_axes.end() != + std::find(decrease_axes.begin(), decrease_axes.end(), i)) { + fake_value_grad_dims[i] = 1; + num_decrease++; + } else if (i < out_dims_size - (value_grad_dims_size + + decrease_axis_size - num_decrease)) { + fake_value_grad_dims[i] = 1; + } else { + auto index_grad = + i - (out_dims_size - + (value_grad_dims_size + decrease_axis_size - num_decrease)); + fake_value_grad_dims[i] = value_grad_dims[index_grad]; + + PADDLE_ENFORCE_EQ((out_dims[i] == value_grad_dims[index_grad]) || + (value_grad_dims[index_grad] == 1), + true, + errors::InvalidArgument( + "An error occurred while calculating %s: " + "[%s] can not be accumulated into [%s].", + paddle::framework::GradVarName("ValueTensor"), + out_dims, + value_grad_dims)); + } + } + + VLOG(3) << "Dimensions of " + << paddle::framework::GradVarName("ValueTensor") << "([" + << value_grad_dims << "])is broadcasted into [" + << fake_value_grad_dims << "]."; + + auto extent = Eigen::DSizes(); + auto offset = out_dims; + for (int i = 0; i < out_dims_size; i++) { + offset[i] = 0; + extent[i] = fake_value_grad_dims[i]; + } + std::vector offsets; + GetOffsets(out_dims, fake_value_grad_dims, offset, 0, &offsets); + + auto value_grad_t = + EigenTensor::From( + *value_grad, fake_value_grad_dims); + + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + + // accumulate gradient + for (auto offset : offsets) { + value_grad_t.device(place) = + value_grad_t + tmp_t.slice(EigenDim::From(offset), extent); + } + if (need_reverse) { + DenseTensor tmp_value = + Full(dev_ctx, + {fake_value_grad_dims.Get(), fake_value_grad_dims.size()}, + static_cast(0)); + auto tmp_value_t = + EigenTensor::From( + tmp_value); + tmp_value_t.device(place) = 
value_grad_t.reverse(reverse_axis); + value_grad_t.device(place) = tmp_value_t; + } + } + } +} + +template +void SetValueGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad) { + const int rank = out_grad.dims().size(); + + switch (rank) { + case 1: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 2: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 3: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 4: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 5: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 6: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "The rank of set_value_grad's input should be less than 7, but " + "received %d.", + rank)); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h index 5aebffe51b5e388bcc7ea72d1b804ffcb8768821..99db559f3b8166258a80814859c3296933634db8 100644 --- a/paddle/phi/kernels/impl/set_value_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h @@ -25,7 +25,6 @@ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/slice_utils.h" namespace phi { diff --git a/paddle/phi/kernels/impl/softmax_kernel_impl.h b/paddle/phi/kernels/impl/softmax_kernel_impl.h index 6552f6ed581f45008f01c02fad3c007bf3664942..7aa43fdb7f27056d5cb4c2947e2764bd8868ff02 100644 --- a/paddle/phi/kernels/impl/softmax_kernel_impl.h +++ b/paddle/phi/kernels/impl/softmax_kernel_impl.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace phi { template -void SoftmaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out) { +void SoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { const int rank = x.dims().size(); const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); int axis_dim = x.dims()[calc_axis]; diff --git a/paddle/phi/kernels/kron_grad_kernel.h b/paddle/phi/kernels/kron_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3daa9dcfba9f0d89bd8dec88905f0ddb321f630a --- /dev/null +++ b/paddle/phi/kernels/kron_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
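SetValueGradImpl and SetValueGradKernel above express, through Eigen strided slices, a simple rule: the gradient w.r.t. the original input is out_grad with the assigned region zeroed, and the gradient w.r.t. the assigned value is that region of out_grad (accumulated over broadcast dimensions and reversed where negative steps were used). A one-dimensional, non-broadcast sketch of that rule, assuming a positive step and in-range indices; names are illustrative, not Paddle APIs:

#include <vector>

void SetValueGrad1DRef(const std::vector<float>& out_grad, int start, int end,
                       int step, std::vector<float>* x_grad,
                       std::vector<float>* value_grad) {
  *x_grad = out_grad;   // start from a copy of the upstream gradient
  value_grad->clear();
  for (int i = start; i < end; i += step) {
    value_grad->push_back(out_grad[i]);  // gradient for the assigned value
    (*x_grad)[i] = 0.0f;                 // assigned slots give no gradient to x
  }
}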
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KronGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/infrt/dialect/pd_types.cc b/paddle/phi/kernels/kron_kernel.h similarity index 61% rename from paddle/infrt/dialect/pd_types.cc rename to paddle/phi/kernels/kron_kernel.h index 94856e362d301978970279846907f41dfbc00b56..4451ac757a9534f4a48db97da81acc2047c26be2 100644 --- a/paddle/infrt/dialect/pd_types.cc +++ b/paddle/phi/kernels/kron_kernel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,4 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/pd_types.h" +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KronKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu b/paddle/phi/kernels/matrix_rank_kernel.h similarity index 63% rename from paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu rename to paddle/phi/kernels/matrix_rank_kernel.h index 5f6d24093a4d703d86550ab1847a082823f8af6b..6edea2723e589340f2c6dc3cfb0be6f895bf08bb 100644 --- a/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu +++ b/paddle/phi/kernels/matrix_rank_kernel.h @@ -14,17 +14,16 @@ limitations under the License. */ #pragma once -#include - -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/core/tensor_meta.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/submanifold_convolution_kernel.h" namespace phi { -namespace sparse {} // namespace sparse + +template +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/matrix_rank_tol_kernel.h b/paddle/phi/kernels/matrix_rank_tol_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..351358dfa04aa7ad2091ad0e01bc63e50046eda0 --- /dev/null +++ b/paddle/phi/kernels/matrix_rank_tol_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixRankTolKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& atol_tensor,
+                         bool use_default_tol,
+                         bool hermitian,
+                         DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/one_hot_kernel.cc b/paddle/phi/kernels/one_hot_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..633f48cbb62ace9e3f7f21502bd61f8c305fb542
--- /dev/null
+++ b/paddle/phi/kernels/one_hot_kernel.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/one_hot_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void OneHotKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const Scalar& num_classes_s,
+                  DenseTensor* out) {
+  int num_classes = num_classes_s.to<int>();
+  OneHotRawKernel<T, Context>(
+      dev_ctx, x, num_classes, phi::DataType::FLOAT32, false, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(one_hot, CPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(one_hot, GPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
+#endif
diff --git a/paddle/phi/kernels/one_hot_kernel.h b/paddle/phi/kernels/one_hot_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f89609ea63365b0e7831201ca003d6c7320c5d7
--- /dev/null
+++ b/paddle/phi/kernels/one_hot_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
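// --- Illustrative aside (editor's example, not part of this patch) ---------
// The new OneHotKernel above simply forwards to OneHotRawKernel with a
// FLOAT32 output and allow_out_of_range == false. The standalone sketch
// below shows the encoding that implies; the helper name and the
// throw-on-out-of-range policy are assumptions made for illustration only.
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<float> OneHotReference(const std::vector<int64_t>& labels,
                                   int depth) {
  // One row of length `depth` per label, with a single 1.0f at the label.
  std::vector<float> out(labels.size() * depth, 0.0f);
  for (size_t i = 0; i < labels.size(); ++i) {
    if (labels[i] < 0 || labels[i] >= depth) {
      throw std::out_of_range("label outside [0, depth)");
    }
    out[i * depth + labels[i]] = 1.0f;
  }
  return out;
}
// OneHotReference({1, 0, 2}, 3) yields {0,1,0, 1,0,0, 0,0,1}.
// --- end aside --------------------------------------------------------------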
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void OneHotKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& num_classes, + DenseTensor* out); + +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/pool_grad_kernel.h b/paddle/phi/kernels/pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0658dc22c823bf7ae162fb2e392f256cfb051496 --- /dev/null +++ b/paddle/phi/kernels/pool_grad_kernel.h @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& 
kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/pool_kernel.h b/paddle/phi/kernels/pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..348af021815175ca2c6c94b9721fec33fbaf864c --- /dev/null +++ b/paddle/phi/kernels/pool_kernel.h @@ -0,0 +1,105 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
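// --- Illustrative aside (editor's example, not part of this patch) ---------
// The pool2d/pool3d grad kernels declared above all share the
// kernel_size/strides/paddings/ceil_mode parameters. The helper below
// restates the conventional per-dimension output-extent arithmetic they
// imply; it ignores adaptive and global_pooling and is an assumption about
// the formula, not a quotation of phi code.
#include <cmath>

inline int PooledSize(int input, int kernel, int pad, int stride,
                      bool ceil_mode) {
  const double raw =
      static_cast<double>(input + 2 * pad - kernel) / stride + 1.0;
  return static_cast<int>(ceil_mode ? std::ceil(raw) : std::floor(raw));
}
// PooledSize(6, 3, 0, 2, /*ceil_mode=*/false) == 2, while ceil_mode keeps the
// trailing partial window: PooledSize(6, 3, 0, 2, true) == 3.
// --- end aside --------------------------------------------------------------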
+ +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..16b52c563a592f0cc23ddca94f554f5dc49e8ccf --- /dev/null +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
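// --- Illustrative aside (editor's example, not part of this patch) ---------
// MaxPool2dWithIndexKernel / MaxPool3dWithIndexKernel above return both the
// pooled maxima and a mask of argmax positions that the matching grad kernels
// consume. A minimal 1-D sketch of that contract, with no padding handling;
// the function name is assumed for illustration.
#include <vector>

void MaxPool1DWithIndex(const std::vector<float>& x, int ksize, int stride,
                        std::vector<float>* out, std::vector<int>* mask) {
  out->clear();
  mask->clear();
  for (int start = 0; start + ksize <= static_cast<int>(x.size());
       start += stride) {
    int best = start;
    for (int i = start + 1; i < start + ksize; ++i) {
      if (x[i] > x[best]) best = i;
    }
    out->push_back(x[best]);
    mask->push_back(best);  // grad kernel routes dout back through this index
  }
}
// --- end aside --------------------------------------------------------------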
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void ROIAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/searchsorted_kernel.h b/paddle/phi/kernels/searchsorted_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e425c7fd7955544cc429fb0a071e8f8038b47063 --- /dev/null +++ b/paddle/phi/kernels/searchsorted_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SearchsortedKernel(const Context& ctx, + const DenseTensor& sorted_sequence, + const DenseTensor& value, + bool out_int32, + bool right, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/set_value_grad_kernel.h b/paddle/phi/kernels/set_value_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6a028b0c8dc50fb068de1ded367990c409bd45cb --- /dev/null +++ b/paddle/phi/kernels/set_value_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SetValueGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/softmax_kernel.h b/paddle/phi/kernels/softmax_kernel.h index ca69d652770aacd01191f5c3ca685276f0f2336f..4edd562ca885301b02b8ecc737c8590831e3cac4 100644 --- a/paddle/phi/kernels/softmax_kernel.h +++ b/paddle/phi/kernels/softmax_kernel.h @@ -19,20 +19,10 @@ limitations under the License. 
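// --- Illustrative aside (editor's example, not part of this patch) ---------
// SearchsortedKernel, declared a few files above, is in essence a batched
// binary search: `right` selects upper-bound instead of lower-bound semantics
// and `out_int32` only changes the index dtype. A 1-D standalone sketch under
// those assumptions (helper name invented for illustration):
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> SearchsortedReference(
    const std::vector<float>& sorted_sequence,
    const std::vector<float>& values, bool right) {
  std::vector<int64_t> out;
  out.reserve(values.size());
  for (float v : values) {
    auto it = right ? std::upper_bound(sorted_sequence.begin(),
                                       sorted_sequence.end(), v)
                    : std::lower_bound(sorted_sequence.begin(),
                                       sorted_sequence.end(), v);
    out.push_back(it - sorted_sequence.begin());
  }
  return out;
}
// SearchsortedReference({1, 3, 5, 7}, {3, 4}, false) == {1, 2};
// with right == true the first query moves past the equal element: {2, 2}.
// --- end aside --------------------------------------------------------------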
*/ namespace phi { -template -void SoftmaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out); - template void SoftmaxKernel(const Context& dev_ctx, const DenseTensor& x, int axis, - DataType dtype, - DenseTensor* out) { - auto cast_x = phi::Cast(dev_ctx, x, dtype); - phi::SoftmaxRawKernel(dev_ctx, axis, out); -} + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 778600a2285de63a481ccd0094cb07a3206b48d9..ff2cf94edb5a378b5d43d569a869fcd705ef12bd 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -18,105 +18,11 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" namespace phi { namespace sparse { -struct Dims4D { - int dims[4]; - Dims4D(const int batch, const int x, const int y, const int z) { - dims[0] = batch; - dims[1] = z; - dims[2] = y; - dims[3] = x; - } - HOSTDEVICE const int& operator[](int i) const { return dims[i]; } -}; - -// Judge whether the current position x is in (lower, upper) -inline HOSTDEVICE bool Check(const int& x, - const int& kx, - const int& pad, - const int& stride, - const int dilation, - const int kdim, - const int xdim) { - const int lower = x - dilation * kx + pad; - const int uper = x + (kdim - kx - 1) * dilation - pad; - return (lower >= 0 && lower % stride == 0 && uper < xdim); -} - -// Check whether the current position(x, y, z) is legal: -// Judge the minimum and maximum values at each latitude -inline HOSTDEVICE bool Check(const Dims4D& dims, - const Dims4D& kernel_dims, - const Dims4D& paddings, - const Dims4D& dilations, - const Dims4D& strides, - const int x, - const int y, - const int z, - const int kx, - const int ky, - const int kz) { - bool x_valid = Check( - x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]); - bool y_valid = Check( - y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]); - bool z_valid = Check( - z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]); - return (x_valid && y_valid && z_valid); -} - -template -inline HOSTDEVICE int PointToIndex(const int& batch, - const int& x, - const int& y, - const int& z, - const Dim& dims) { - return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + - y * dims[3] + x; -} - -template -inline HOSTDEVICE void IndexToPoint( - const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { - int n = index; - *x = n % dims[3]; - n /= dims[3]; - *y = n % dims[2]; - n /= dims[2]; - *z = n % dims[1]; - n /= dims[1]; - *batch = n; -} - -inline void GetOutShape(const DDim& x_dims, - const DDim& kernel_dims, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - DDim* out_dims) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - 5, - phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); - PADDLE_ENFORCE_EQ(kernel_dims.size(), - 5, - phi::errors::InvalidArgument( - "the shape of kernel should be (D, H, W, C, OC)")); - - // infer out shape - (*out_dims)[0] = x_dims[0]; - (*out_dims)[4] = kernel_dims[4]; - for (int i = 1; i < 4; i++) { - (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - - dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / - strides[i - 1] + - 1; - } -} - template void Conv3dKernel(const 
Context& dev_ctx, const SparseCooTensor& x, diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index a5a946dce7912f706b1b4c149c89331ce9a3744f..64c32df18971c4d66873b02a61220a5bed8db005 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -16,8 +16,6 @@ limitations under the License. */ #include -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/sparse_coo_tensor.h" @@ -28,6 +26,8 @@ limitations under the License. */ namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + // such as: kernel(3, 3, 3), kernel_size = 27 // counter_per_weight: (kernel_size) // TODO(zhangkaihuo): optimize performance with multithreading @@ -67,7 +67,8 @@ void ProductRuleBook(const Context& dev_ctx, int in_z = indices_ptr[i + non_zero_num]; int in_y = indices_ptr[i + 2 * non_zero_num]; int in_x = indices_ptr[i + 3 * non_zero_num]; - int index = PointToIndex(batch, in_x, in_y, in_z, x_dims); + int index = phi::funcs::sparse::PointToIndex( + batch, in_x, in_y, in_z, x_dims); hash_in.insert(index); } } @@ -86,20 +87,20 @@ void ProductRuleBook(const Context& dev_ctx, int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0]; int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1]; int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2]; - if (Check(c_x_dims, - c_kernel_dims, - c_paddings, - c_dilations, - c_strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { + if (phi::funcs::sparse::Check(c_x_dims, + c_kernel_dims, + c_paddings, + c_dilations, + c_strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { if (subm) { - int out_index = - PointToIndex(batch, out_x, out_y, out_z, out_dims); + int out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); if (hash_in.find(out_index) == hash_in.end()) { continue; } @@ -112,7 +113,7 @@ void ProductRuleBook(const Context& dev_ctx, rulebook_ptr[rulebook_index] = kernel_index - 1; rulebook_ptr[rulebook_index + rulebook_len] = i; // in_i rulebook_ptr[rulebook_index + rulebook_len * 2] = - PointToIndex( + phi::funcs::sparse::PointToIndex( batch, out_x, out_y, out_z, out_dims); // out_index ++rulebook_index; } @@ -161,7 +162,7 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) { const int index = *it; int batch, x, y, z; - IndexToPoint(index, out_dims, &batch, &x, &y, &z); + phi::funcs::sparse::IndexToPoint(index, out_dims, &batch, &x, &y, &z); out_indices_ptr[i] = batch; out_indices_ptr[i + out_non_zero_num] = z; out_indices_ptr[i + out_non_zero_num * 2] = y; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index bb414faef6743126cf2e25b49ae17689f0a6048f..5d7b381b7cb0beef7e69608ce7f732d8cdf9d222 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -94,30 +94,15 @@ void Conv3dGradKernel(const Context& dev_ctx, offsets[kernel_size] = offset; if (subm) { - blas.GEMM(CblasTrans, - CblasNoTrans, - x.non_zero_elements().dims()[1], - out_grad.non_zero_elements().dims()[1], - x.non_zero_elements().dims()[0], - static_cast(1), - x.non_zero_elements().data(), - out_grad.non_zero_elements().data(), - 
static_cast(0), - d_kernel_ptr + half_kernel_size * in_channels * out_channels); - - // call gemm: d_x = out_grad * transpose(kernel) - // (n, out_channels) * (out_channels, in_channels) - T* x_grad_ptr = x_grad->data(); - blas.GEMM(CblasNoTrans, - CblasTrans, - out_grad.non_zero_elements().dims()[0], - in_channels, - out_grad.non_zero_elements().dims()[1], - static_cast(1), - out_grad.non_zero_elements().data(), - kernel.data() + half_kernel_size * in_channels * out_channels, - static_cast(0), - x_grad_ptr); + phi::funcs::sparse::SubmPreProcess(dev_ctx, + x, + kernel, + out_grad, + in_channels, + out_channels, + half_kernel_size, + kernel_grad, + x_grad); if (max_count == 0) { return; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index f65e1cf579a9344b6e46ff693b1ce05600adc6a0..746ca04a826c020201e53803dd2ec83519cf576e 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sparse/cpu/convolution.h" -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -46,10 +44,16 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; - GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + phi::funcs::sparse::GetOutShape( + x_dims, kernel_dims, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; + std::vector subm_paddings(paddings), subm_strides(strides); + if (subm) { + phi::funcs::sparse::ResetSubmKernelSizeAndStrides( + kernel.dims(), &subm_paddings, &subm_strides); + } // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook @@ -60,9 +64,9 @@ void Conv3dKernel(const Context& dev_ctx, ProductRuleBook(dev_ctx, x, kernel, - paddings, + subm_paddings, dilations, - strides, + subm_strides, out_dims, subm, rulebook, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index ba89135641e0e67daa84cd526d8b389953ef1862..50e95ee0b8a4876a65b8ba7d09fd2d112eac2b30 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -14,9 +14,9 @@ limitations under the License. 
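// --- Illustrative aside (editor's example, not part of this patch) ---------
// The CPU sparse convolution kernel above (and the GPU one later in this
// diff) now obtains its output shape from phi::funcs::sparse::GetOutShape.
// The in-header helper removed earlier in this diff computed the usual
// dilated-convolution extent, restated here as a standalone function
// (name invented for illustration):
inline int ConvOutSize(int input, int kernel, int pad, int dilation,
                       int stride) {
  return (input + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}
// ConvOutSize(5, 3, 1, 1, 1) == 5 (padding preserves the extent), and
// ConvOutSize(5, 3, 0, 1, 2) == 2.
// --- end aside --------------------------------------------------------------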
*/ #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" namespace phi { namespace sparse { @@ -71,7 +71,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx, int64_t non_zero_num = GetNonZeroNum(x, sparse_dim); const auto place = dev_ctx.GetPlace(); - const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + const auto values_dims = + phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); DenseTensorMeta indices_meta(DataType::INT64, {sparse_dim, static_cast(non_zero_num)}, DataLayout::NCHW); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index a307ab0f54613a91deee6215b5c389ca0a44d6e8..d6d992d0f4b651b1a9a47cdddcae116a215a0e57 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -110,30 +110,15 @@ void Conv3dGradKernel(const Context& dev_ctx, offsets[kernel_size] = offset; if (subm) { - blas.GEMM(CblasTrans, - CblasNoTrans, - x.non_zero_elements().dims()[1], - out_grad.non_zero_elements().dims()[1], - x.non_zero_elements().dims()[0], - static_cast(1), - x.non_zero_elements().data(), - out_grad.non_zero_elements().data(), - static_cast(0), - d_kernel_ptr + half_kernel_size * in_channels * out_channels); - - // call gemm: d_x = out_grad * transpose(kernel) - // (n, out_channels) * (out_channels, in_channels) - T* x_grad_ptr = x_grad->data(); - blas.GEMM(CblasNoTrans, - CblasTrans, - out_grad.non_zero_elements().dims()[0], - in_channels, - out_grad.non_zero_elements().dims()[1], - static_cast(1), - out_grad.non_zero_elements().data(), - kernel.data() + half_kernel_size * in_channels * out_channels, - static_cast(0), - x_grad_ptr); + phi::funcs::sparse::SubmPreProcess(dev_ctx, + x, + kernel, + out_grad, + in_channels, + out_channels, + half_kernel_size, + kernel_grad, + x_grad); if (max_count == 0) { return; } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 94186600f1e2994f9b464bb8d81e9dbf891a4ae9..1a0c7e9b972145fbc98cdb9dfee0267a9eaa9f90 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -33,6 +33,8 @@ limitations under the License. 
*/ namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + __global__ void SetFlagAndUpdateCounterKernel(const int* indexs, const int n, const int rulebook_len, @@ -83,7 +85,8 @@ __global__ void UpdateIndexKernel(const int* unique_keys, for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { const int index = unique_keys[i]; int batch, x, y, z; - IndexToPoint(index, out_dims, &batch, &x, &y, &z); + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); // get out indices out_indices[i] = batch; out_indices[i + non_zero_num] = z; @@ -150,23 +153,23 @@ __global__ void ProductRuleBookKernel(const int* x_indices, for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { int in_i = -1, out_index = -1, kernel_i = -1; - if (Check(x_dims, - kernel_dims, - paddings, - dilations, - strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; in_i = i; - out_index = - PointToIndex(batch, out_x, out_y, out_z, out_dims); + out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); atomicAdd(&counter_buf[kernel_index], 1); kernel_i = kernel_index; } @@ -542,7 +545,8 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; - GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + phi::funcs::sparse::GetOutShape( + x_dims, kernel_dims, paddings, dilations, strides, &out_dims); out->set_dims(out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; @@ -564,11 +568,8 @@ void Conv3dKernel(const Context& dev_ctx, std::vector subm_paddings(paddings), subm_strides(strides); if (subm) { - auto kernel_dims = kernel.dims(); - for (int i = 0; i < paddings.size(); i++) { - subm_paddings[i] = kernel_dims[i] / 2; - subm_strides[i] = 1; - } + phi::funcs::sparse::ResetSubmKernelSizeAndStrides( + kernel.dims(), &subm_paddings, &subm_strides); } int n = ProductRuleBook(dev_ctx, diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 2e741111fb1489aef5bdc51de637b77eec9d28a7..8048180e425ead98e6db15514caf38c406a2aebf 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" namespace phi { @@ -115,14 +117,16 @@ void DenseToSparseCooKernel(const Context& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream())); #endif - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); auto temp_indexs_meta = phi::DenseTensorMeta(DataType::INT32, {rows}, phi::DataLayout::NCHW); DenseTensor temp_indexs = phi::Empty(dev_ctx, std::move(temp_indexs_meta)); int* temp_indexs_ptr = temp_indexs.mutable_data(place); - GetNonZeroNums<<>>( + GetNonZeroNums<<>>( x_data, rows, cols, nums_ptr, temp_indexs_ptr); #ifdef PADDLE_WITH_HIP thrust::remove(thrust::hip::par.on(dev_ctx.stream()), @@ -167,7 +171,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx, dev_ctx.Wait(); // wait the copy - const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + const auto values_dims = + phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); DenseTensorMeta indices_meta(DataType::INT64, {sparse_dim, static_cast(non_zero_num)}, DataLayout::NCHW); @@ -184,16 +189,18 @@ void DenseToSparseCooKernel(const Context& dev_ctx, T* sparse_data = values.mutable_data(place); // 3. calc indices by indexs and get values by indexs - GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size); - GetNonZeroElementsAndIndices<<>>( - x_data, - sparse_dim, - cols, - d_x_dims.data(), - non_zero_num, - temp_indexs_ptr, - indices_data, - sparse_data); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + GetNonZeroElementsAndIndices<<>>(x_data, + sparse_dim, + cols, + d_x_dims.data(), + non_zero_num, + temp_indexs_ptr, + indices_data, + sparse_data); out->SetMember(indices, values, x_dims, true); } @@ -263,10 +270,9 @@ void SparseCsrToCooKernel(const Context& dev_ctx, int* offsets_ptr = batchs == 1 ? 
nullptr : offsets.mutable_data(place); T* coo_values_data = values.mutable_data(place); - int grid_size = 1, block_size = 1; if (batchs > 1) { - GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size); - GetBatchSizes<<>>( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); + GetBatchSizes<<>>( csr_crows_data, rows, batchs, offsets_ptr); #ifdef PADDLE_WITH_HIP @@ -279,9 +285,10 @@ void SparseCsrToCooKernel(const Context& dev_ctx, offsets_ptr); } - GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); - dim3 grids(grid_size, batchs, 1); - ConvertCsrCrowsToCooRows<<>>( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + config.block_per_grid.y = batchs; + ConvertCsrCrowsToCooRows<<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); #ifdef PADDLE_WITH_HIP @@ -404,21 +411,29 @@ void SparseCooToCsrKernel(const Context& dev_ctx, // TODO(zhangkahuo): call coalesced() to distinct and sort the indices } - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); if (batchs > 1) { DenseTensorMeta batchs_meta(DataType::INT64, {batchs}, DataLayout::NCHW); phi::DenseTensor batchs_offset( phi::make_intrusive(place), std::move(batchs_meta)); int64_t* batchs_offset_ptr = batchs_offset.mutable_data(place); - GetBatchsOffset<<>>( + GetBatchsOffset<<>>( batchs_ptr, non_zero_num, batchs_offset_ptr); - dim3 grids(grid_size, batchs, 1); - ConvertCooRowsToCsrCrows<<>>( + config.block_per_grid.y = batchs; + ConvertCooRowsToCsrCrows<<>>( batchs_offset_ptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } else { - ConvertCooRowsToCsrCrows<<>>( + ConvertCooRowsToCsrCrows<<>>( nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } @@ -522,12 +537,13 @@ void SparseCooToDenseKernel(const Context& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream())); #endif - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - KernelSparseCooToDense< - T, - int64_t><<>>( + KernelSparseCooToDense<<>>( indices.data(), d_sparse_offsets.data(), x_data, diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index c83b2130ed4550540a98148aec26e42332c8060d..da05eb3d3cf7682e376efe59aaea8d09d1b6c757 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -23,37 +23,6 @@ limitations under the License. 
*/ namespace phi { namespace sparse { -inline const DDim InferDenseDims(const DDim& x_dims, - const int64_t sparse_dim, - const int64_t non_zero_num) { - auto dense_dim = x_dims.size() - sparse_dim; - DDim values_dims; - if (dense_dim) { - std::vector dense_dim_vec(dense_dim + 1); - dense_dim_vec[0] = non_zero_num; - memcpy(&dense_dim_vec[1], - x_dims.Get() + sparse_dim, - dense_dim * sizeof(x_dims[0])); - values_dims = phi::make_ddim(dense_dim_vec); - } else { - values_dims = phi::make_ddim({non_zero_num}); - } - return values_dims; -} - -template -inline void GetGpuLaunchConfig1D(const Context& dev_ctx, - const int64_t n, - int* grid_size, - int* block_size) { - const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - *block_size = (n >= MAX_BLOCK_DIM) ? MAX_BLOCK_DIM - : (1 << static_cast(std::log2(n))); - *grid_size = n / *block_size; - *grid_size = (*grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : *grid_size; -} - template void DenseToSparseCooKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index f8547ced41934a9810dc6874c090ab5aefd43497..c4c13578a989961839600d8ee403e478c76d1345 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -141,19 +141,9 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - T a_normal_cdf; - T b_normal_cdf; - TruncatedNormal(T mean, T std) : mean(mean), std(std) { - auto normal_cdf = [](T x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf(-2.0); - b_normal_cdf = normal_cdf(2.0); - } - + TruncatedNormal(T mean, T std) : mean(mean), std(std) {} T operator()(T value) const { - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; + return std::sqrt(2.0) * Erfinv(value) * std + mean; } }; diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 396830ca20765bc24d9ddc0e9d09ef045d376dfc..cbfca5b17ae995a89360c6d6d4987028d95dc281 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -16,40 +16,80 @@ limitations under the License. 
*/ namespace phi { -#define DefineActGradDepXOpArgMap(func_name, op_name) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature( \ - op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \ +#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"X", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } -#define DefineActGradDepOutOpArgMap(func_name, op_name) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature( \ - op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); \ +#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"Out", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } +#define comma , + +DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT +DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT +DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT +DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT +DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT +DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT +DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT +DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT +DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT +DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT +DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT +DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT +DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT +DefineActGradDepXOpArgMap(ThresholdedRelu, + "thresholded_relu", + "threshold"); // NOLINT + +DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT +DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT + KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"}); } -DefineActGradDepXOpArgMap(Cos, "cos"); -DefineActGradDepXOpArgMap(Tan, "tan"); -DefineActGradDepXOpArgMap(Acos, "acos"); -DefineActGradDepXOpArgMap(Sin, "sin"); -DefineActGradDepXOpArgMap(Asin, "asin"); -DefineActGradDepXOpArgMap(Atan, "atan"); -DefineActGradDepXOpArgMap(Sinh, "sinh"); -DefineActGradDepXOpArgMap(Cosh, "cosh"); -DefineActGradDepXOpArgMap(Asinh, "asinh"); -DefineActGradDepXOpArgMap(Acosh, "acosh"); -DefineActGradDepXOpArgMap(Atanh, "atanh"); -DefineActGradDepOutOpArgMap(Relu, "relu"); +KernelSignature TanhDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature TanhTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("tanh_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + +KernelSignature LeakyReluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "leaky_relu_double_grad", {"X", "DDX"}, {"alpha"}, {"DDOut"}); +} + +KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, 
relu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -65,3 +105,16 @@ PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad, + phi::TanhDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad, + phi::TanhTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad, + phi::LeakyReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, + phi::LeakyReluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, + phi::ThresholdedReluGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/allclose_sig.cc b/paddle/phi/ops/compat/allclose_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5c4fc027b54225cfdbcc67498eed18789922bd3 --- /dev/null +++ b/paddle/phi/ops/compat/allclose_sig.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
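// --- Illustrative aside (editor's example, not part of this patch) ---------
// The argument mappings above (and in the sig files that follow) address
// gradient slots through GradVarName; the op-signature tests at the end of
// this diff spell those names out as "Out@GRAD", "Input@GRAD",
// "ValueTensor@GRAD". A tiny standalone restatement of that naming
// convention (not the real fluid macro):
#include <string>

inline std::string GradSlotName(const std::string& var) {
  return var + "@GRAD";  // gradient slot = forward variable name + "@GRAD"
}
// GradSlotName("Out") == "Out@GRAD"; GradSlotName("X") == "X@GRAD".
// --- end aside --------------------------------------------------------------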
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AllCloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Rtol")) { + if (ctx.HasInput("Atol")) { + return KernelSignature("allclose", + {"Input", "Other"}, + {"Rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("allclose", + {"Input", "Other"}, + {"Rtol", "atol", "equal_nan"}, + {"Out"}); + } + } else { + if (ctx.HasInput("Atol")) { + return KernelSignature("allclose", + {"Input", "Other"}, + {"rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("allclose", + {"Input", "Other"}, + {"rtol", "atol", "equal_nan"}, + {"Out"}); + } + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(allclose, phi::AllCloseOpArgumentMapping); diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc index 0a14b9095c8343f47e1d6aa039c9aced963984ce..f3245b922c0d913a87b58f813bd0ca142ecb6287 100644 --- a/paddle/phi/ops/compat/diag_sig.cc +++ b/paddle/phi/ops/compat/diag_sig.cc @@ -20,8 +20,15 @@ KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); } +KernelSignature DiagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "diag_grad", {"X", GradVarName("Out")}, {"offset"}, {GradVarName("X")}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); +PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad); PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index fc890fa3a4923aaf452af20fd586c82d506ea1a7..1d2aaa04f05d205483dbda5c738c7499ad068881 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -114,6 +114,14 @@ KernelSignature ElementwiseDivGradOpArgumentMapping( {GradVarName("X"), GradVarName("Y")}); } +KernelSignature ElementwiseFMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_fmin_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("divide_double_grad", @@ -130,6 +138,14 @@ KernelSignature ElementwiseMulGradOpArgumentMapping( {GradVarName("X"), GradVarName("Y")}); } +KernelSignature ElementwiseFMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_fmax_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + KernelSignature ElementwiseMulDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("multiply_double_grad", @@ -192,3 +208,9 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad, phi::ElementwiseMulDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad, phi::ElementwiseMulTripleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad, + phi::ElementwiseFMaxGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad, + phi::ElementwiseFMinGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kron_sig.cc b/paddle/phi/ops/compat/kron_sig.cc new file mode 100644 index 
0000000000000000000000000000000000000000..06b6545f58e7c12964f82fd8b6199270c519c16a --- /dev/null +++ b/paddle/phi/ops/compat/kron_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KronGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("kron_grad", + {"X", "Y", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(kron_grad, phi::KronGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/matrix_rank_sig.cc b/paddle/phi/ops/compat/matrix_rank_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..40dc29579b40194f57911df0bfb426de4369d9b3 --- /dev/null +++ b/paddle/phi/ops/compat/matrix_rank_sig.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +// we have to return every specific KernelSignature for infrt now +KernelSignature MatrixRankOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("TolTensor")) { + return KernelSignature("matrix_rank_tol", + {"X", "TolTensor"}, + {"use_default_tol", "hermitian"}, + {"Out"}); + } else { + return KernelSignature("matrix_rank", + {"X"}, + { + "tol", "use_default_tol", "hermitian", + }, + {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(matrix_rank, phi::MatrixRankOpArgumentMapping); diff --git a/paddle/phi/ops/compat/one_hot_sig.cc b/paddle/phi/ops/compat/one_hot_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..655969093c889aa32ae780f1de3c9c7c81a78eb1 --- /dev/null +++ b/paddle/phi/ops/compat/one_hot_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
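// --- Illustrative aside (editor's example, not part of this patch) ---------
// The kron_grad mapping registered a little earlier pairs with the
// KronKernel/KronGradKernel declarations added near the top of this patch.
// As a reminder of the forward semantics those kernels implement, a dense,
// row-major 2-D Kronecker product reference (plain sketch, not phi's
// implementation):
#include <vector>

std::vector<float> KronReference(const std::vector<float>& a, int m, int n,
                                 const std::vector<float>& b, int p, int q) {
  // C is (m*p) x (n*q) with C[i*p + k][j*q + l] = A[i][j] * B[k][l].
  std::vector<float> c(static_cast<size_t>(m) * p * n * q, 0.0f);
  const int out_cols = n * q;
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < p; ++k)
        for (int l = 0; l < q; ++l)
          c[(i * p + k) * out_cols + (j * q + l)] =
              a[i * n + j] * b[k * q + l];
  return c;
}
// KronReference({1, 2}, 1, 2, {0, 1, 1, 0}, 2, 2) == {0,1,0,2, 1,0,2,0}.
// --- end aside --------------------------------------------------------------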
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature OneHotOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("depth_tensor")) { + return KernelSignature("one_hot_raw", + {"X"}, + {"depth_tensor", "dtype", "allow_out_of_range"}, + {"Out"}); + } else { + return KernelSignature("one_hot_raw", + {"X"}, + {"depth", "dtype", "allow_out_of_range"}, + {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(one_hot_v2, one_hot); + +PD_REGISTER_ARG_MAPPING_FN(one_hot_v2, phi::OneHotOpArgumentMapping); diff --git a/paddle/phi/ops/compat/pool_sig.cc b/paddle/phi/ops/compat/pool_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..390d3db5e785ba7642213e9b7a8db2b718ff19f0 --- /dev/null +++ b/paddle/phi/ops/compat/pool_sig.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature Pool2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_double_grad", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature MaxPool2dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool2dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {GradVarName("X")}); +} + +KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", 
+ "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature MaxPool3dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool3dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad, + phi::Pool2dDoubleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index, + phi::MaxPool2dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad, + phi::MaxPool2dWithIndexGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index, + phi::MaxPool3dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad, + phi::MaxPool3dWithIndexGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..0549103b6fbcb8b2367c34c8a44fb3b52f318859 --- /dev/null +++ b/paddle/phi/ops/compat/roi_align_sig.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roi_align", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", + "pooled_width", + "spatial_scale", + "sampling_ratio", + "aligned"}, + {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping); diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index eacfff26d53cf1ea73c33e4c603253c58be60222..9653250bded84f8ff87f613f6e17e50e351504fa 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -731,6 +731,108 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } return KernelSignature("unregistered", {}, {}, {}); } + +KernelSignature SetValueGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.HasInput("StartsTensorList")) { + if (ctx.HasInput("EndsTensorList")) { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } else { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } + } else { + if (ctx.HasInput("EndsTensorList")) { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } else { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", "ends", "steps", "axes", "decrease_axes", "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } + } +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(set_value, phi::SetValueOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(set_value_grad, phi::SetValueGradOpArgumentMapping); diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc index f4288c2aa2f9418eeff489aa53fe685aa4a155ec..399112d09c2ad55364b5035e7b759b53d0abaea8 100644 --- a/paddle/phi/tests/core/test_meta_fn_utils.cc +++ b/paddle/phi/tests/core/test_meta_fn_utils.cc @@ 
-52,7 +52,7 @@ TEST(MetaFnFactory, InferMetaFnExists) { phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("sign")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -78,7 +78,7 @@ TEST(MetaFnFactory, CopyInferMetaFn) { ctx.EmplaceBackAttr(Backend::CPU); ctx.EmplaceBackAttr(false); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("copy_to")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -105,7 +105,7 @@ TEST(MetaFnFactory, SplitInferMetaFn) { ctx.EmplaceBackAttr(num_or_sections); ctx.EmplaceBackAttr(axis); ctx.EmplaceBackOutputs(out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("split")(&ctx); ASSERT_EQ(dense_out1.dims().size(), 2); diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc index 88c9193a8f8949bd6f315c9c4bdf89d6029a8696..36923972ea4145a63101f84eeb5da76d73ffce75 100644 --- a/paddle/phi/tests/ops/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -484,5 +484,98 @@ TEST(ARG_MAP, set_value) { "set_value"); } +TEST(ARG_MAP, set_value_grad) { + TestArgumentMappingContext arg_case( + {"Out@GRAD", "StartsTensorList", "EndsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case1( + {"Out@GRAD", "StartsTensorList", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case1) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case2({"Out@GRAD", "StartsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case2) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case3( + {"Out@GRAD", "EndsTensorList", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case3) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case4({"Out@GRAD", "EndsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case4) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case5({"Out@GRAD", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case5) + .name, + "set_value_grad"); +} + +TEST(ARG_MAP, allclose) { + TestArgumentMappingContext arg_case1( + {"Input", "Other", "Rtol"}, + {}, + {{"atol", paddle::any(std::string{"1e-8"})}, + {"equal_nan", paddle::any(false)}}, + {"Out"}, + {}); + auto signature1 = + OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case1); + ASSERT_EQ(signature1.name, "allclose"); + auto attr_names1 = std::get<1>(signature1.args); + ASSERT_EQ(attr_names1[0], "Rtol"); + + 
TestArgumentMappingContext arg_case2( + {"Input", "Other", "Atol"}, + {}, + {{"rtol", paddle::any(std::string{"1e-5"})}, + {"equal_nan", paddle::any(false)}}, + {"Out"}, + {}); + auto signature2 = + OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case2); + ASSERT_EQ(signature2.name, "allclose"); + auto attr_names2 = std::get<1>(signature2.args); + ASSERT_EQ(attr_names2[1], "Atol"); +} + } // namespace tests } // namespace phi diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 35b2ce751b18fff2aac8dedfd09e5fe209d95533..75afa4ef43ff602d0fe4b9a6ce7c7c6ad5aab8a0 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -66,6 +66,7 @@ if not defined WITH_TESTING set WITH_TESTING=ON if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON +if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON if not defined WITH_TPCACHE set WITH_TPCACHE=OFF @@ -757,7 +758,7 @@ for /F %%i in ("%libsize%") do ( ) cd /d %work_dir%\paddle\fluid\inference\api\demo_ci -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %MSVC_STATIC_CRT% +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT% goto:eof :test_inference_error @@ -857,7 +858,7 @@ echo Step 7. Testing fluid library with infer_ut for inference ... echo ======================================== cd /d %work_dir%\paddle\fluid\inference\tests\infer_ut -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %MSVC_STATIC_CRT% +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT% goto:eof :test_inference_ut_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 84f7a57999fd66a6c24ae3ccf88c93f9beaa97e5..39676b916e50470ac9774f3564b4bdc3a8fcb20f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -229,6 +229,7 @@ function cmake_base() { -DWITH_CNCL=${WITH_CNCL:-OFF} -DWITH_XPU=${WITH_XPU:-OFF} -DWITH_MLU=${WITH_MLU:-OFF} + -DWITH_IPU=${WITH_IPU:-OFF} -DLITE_GIT_TAG=release/v2.10 -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF} -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} @@ -280,6 +281,7 @@ EOF -DLITE_GIT_TAG=release/v2.10 \ -DWITH_XPU=${WITH_XPU:-OFF} \ -DWITH_MLU=${WITH_MLU:-OFF} \ + -DWITH_IPU=${WITH_IPU:-OFF} \ -DWITH_CNCL=${WITH_CNCL:-OFF} \ -DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \ -DWITH_LITE=${WITH_LITE:-OFF} \ @@ -948,8 +950,17 @@ function generate_upstream_develop_api_spec() { git checkout . 
git checkout -b develop_base_pr upstream/$BRANCH startTime_firstBuild=`date +%s` - cmake_gen $1 - build $2 + + dev_commit=`git log -1|head -1|awk '{print $2}'` + dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" + url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'` + if [ "$url_return" == '200' ];then + mkdir -p ${PADDLE_ROOT}/build/python/dist && wget -q -P ${PADDLE_ROOT}/build/python/dist ${dev_url} + else + cmake_gen $1 + build $2 + fi + cp ${PADDLE_ROOT}/python/requirements.txt /tmp pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` echo "pr_whl_size: ${pr_whl_size}" @@ -1274,6 +1285,8 @@ function card_test() { CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l) elif [ "${WITH_MLU}" == "ON" ];then CUDA_DEVICE_COUNT=1 + elif [ "${WITH_IPU}" == "ON" ];then + CUDA_DEVICE_COUNT=1 else CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi @@ -2231,6 +2244,130 @@ set -ex fi } +function parallel_test_base_ipu() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu + if [ ${WITH_TESTING:-ON} == "ON" ] ; then + cat <> ${PADDLE_ROOT}/build/build_summary.txt + ut_actual_total_endTime_s=`date +%s` + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + if [[ "$EXIT_CODE" != "0" ]]; then + show_ut_retry_result + fi +set -ex + fi +} + function parallel_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -2248,6 +2385,8 @@ function parallel_test() { parallel_test_base_npu elif [ "$WITH_MLU" == "ON" ];then parallel_test_base_mlu + elif [ "$WITH_IPU" == "ON" ];then + parallel_test_base_ipu else parallel_test_base_cpu ${PROC_RUN:-1} fi @@ -3013,6 +3152,11 @@ function main() { parallel_test check_coverage ;; + check_ipu_coverage) + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + parallel_test + check_coverage + ;; reuse_so_cicheck_py35) reuse_so_cache parallel_test diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 36ca048c51210ff7c12679731653ce026206b3c6..6fc6f7d3d494a28b822c3044716ec66867538a3d 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -81,15 +81,14 @@ def backward(tensors, grad_tensors=None, retain_graph=False): if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} connot be empyt".format(name) for each_var in in_out_list: - assert isinstance( - each_var, paddle. - Tensor), "Elements of {} must be paddle.Tensor".format(name) + assert isinstance(each_var, ( + paddle.Tensor, core.eager.Tensor + )), "Elements of {} must be paddle.Tensor".format(name) return in_out_list else: - assert isinstance( - in_out_list, - paddle.Tensor), "{} must be Tensor or list of Tensor".format( - name) + assert isinstance(in_out_list, ( + paddle.Tensor, core.eager.Tensor + )), "{} must be Tensor or list of Tensor".format(name) return [in_out_list] tensors = check_tensors(tensors, "tensors") @@ -105,10 +104,13 @@ def backward(tensors, grad_tensors=None, retain_graph=False): for each_tensor in grad_tensors: if each_tensor is not None: assert isinstance( - each_tensor, paddle.Tensor + each_tensor, (paddle.Tensor, core.eager.Tensor) ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'." 
else: - grad_tensors = [None] * len(tensors) + if core._in_eager_mode(): + grad_tensors = [] + else: + grad_tensors = [None] * len(tensors) if len(grad_tensors) > 0: assert len(tensors) == len( @@ -116,5 +118,8 @@ def backward(tensors, grad_tensors=None, retain_graph=False): assert isinstance(retain_graph, bool), "retain_graph must be True or False" - core.dygraph_run_backward(tensors, grad_tensors, retain_graph, - framework._dygraph_tracer()) + if core._in_eager_mode(): + core.eager.run_backward(tensors, grad_tensors, retain_graph) + else: + core.dygraph_run_backward(tensors, grad_tensors, retain_graph, + framework._dygraph_tracer()) diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/converter.py new file mode 100644 index 0000000000000000000000000000000000000000..d88f9fe7501b56be255448a412fdcc6ec56cd13b --- /dev/null +++ b/python/paddle/distributed/auto_parallel/converter.py @@ -0,0 +1,455 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import warnings +import logging +import numpy as np +from ..utils import get_logger + + +class Converter(object): + """ + Converter is a class object for auto parallel to convert tensors from + one parallel strategy to another one. Tensors will merge and slice value + with their strategy when strategies are different. + """ + + def __init__(self, tensors_dict, pre_strategy, cur_strategy): + """ + Args: + tensors_dict(dict): tensors' value of all ranks that to be converted. + key is tensor's name(str), value is all ranks' data(list(numpy.ndarray)) + pre_strategy(dict): tensors' distributed attribute of last training process. + key is tensor's name(str), value is tensor's distributed attribute in last + training process. + cur_strategy(dict): tensors' distributed attribute of current rank. + key is tensor's name(str), value is tensor's distributed attribute in current + rank. + """ + self._tensors_dict = self._check_tensor_dict(tensors_dict) + self._pre_strategy = self._check_pre_strategy(pre_strategy) + self._cur_strategy = self._check_cur_strategy(cur_strategy) + self._logger = get_logger(logging.INFO) + + def _check_tensor_dict(self, tensors_dict): + if not tensors_dict: + raise ValueError("'tensors_dict' is None, " + "the tensors to be converted cannot be None.") + if not isinstance(tensors_dict, dict): + raise TypeError( + "The type of 'tensors_dict' should be 'dict', but got '{}'.". 
+ format(str(type(tensors_dict)))) + return tensors_dict + + def _check_pre_strategy(self, pre_strategy): + if not pre_strategy: + raise ValueError("'pre_strategy' is None, " + "there are not tensors in pre process.") + if not isinstance(pre_strategy, dict): + raise TypeError("The type of 'pre_strategy' should be 'dict', " + "but got '{}'.".format(str(type(pre_strategy)))) + return pre_strategy + + def _check_cur_strategy(self, cur_strategy): + if not cur_strategy: + warnings.warn("'cur_strategy' is None, " + "there are not tensors in cur process") + if not isinstance(cur_strategy, dict): + raise TypeError("The type of 'cur_strategy' should be 'dict', " + "but got '{}'.".format(str(type(cur_strategy)))) + return cur_strategy + + def convert(self, strict=True): + """ + Convert tensors + + Args: + strict(bool): whether to strict convert tensor with tensor's name. If False, it will + convert tensors by prefix matching. Otherwise, tensors will be converted with + their name strictly. + + Returns: + converted tensors(dict) + + Examples: + .. code-block:: python + + import numpy as np + complete_tensors = np.arange(4).reshape([2, 2]) + partitial_tensors = np.split(complete_tensors, 2, axis=0) + name = "tmp_0" + tensors_dict = {name: partitial_tensors} + strategy_1 = { + name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + strategy_2 = { + name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, -1] + } + } + converter = Converter(tensors_dict, strategy_1, strategy_2) + result = converter.convert() + # the result's value is equal to `complete_tensors` + """ + tensors_dict = {} + # the name which is in cur_process but not in pre_process + tensor_not_in_pre = [] + # the name which is in pre_process but not in cur_process + tensor_not_in_cur = [] + # the name which is in strategy but not in ckpt files + tensor_not_in_ckpt = [] + self._logger.info("Start to convert tensors.") + for tensor_name in self._cur_strategy: + if tensor_name not in self._pre_strategy: + tensor_not_in_pre.append(tensor_name) + continue + if tensor_name not in self._tensors_dict: + tensor_not_in_ckpt.append(tensor_name) + continue + self._pre_name = tensor_name + self._cur_name = tensor_name + tensor_list = self._tensors_dict[tensor_name] + pre_dist_attr = self._pre_strategy[tensor_name] + cur_dist_attr = self._cur_strategy[tensor_name] + try: + tensors_dict[tensor_name] = Converter.merge_and_slice( + tensor_list, pre_dist_attr, cur_dist_attr) + except ValueError as err: + raise ValueError("Fail to convert tensor '{}'. " + .format(str(tensor_name)) + str(err)) + + for tensor_name in self._pre_strategy: + if tensor_name not in self._cur_strategy: + tensor_not_in_cur.append(tensor_name) + + if not strict: + tensors_dict, tensor_match_with_pre, tensor_match_with_cur = self.convert_with_prefix_match( + tensors_dict, tensor_not_in_pre, tensor_not_in_cur) + else: + tensors_dict, tensor_match_with_pre, tensor_match_with_cur = tensors_dict, [], [] + + tensor_not_in_pre = set(tensor_not_in_pre) - set(tensor_match_with_pre) + tensor_not_in_cur = set(tensor_not_in_cur) - set(tensor_match_with_cur) + if tensor_not_in_pre: + warnings.warn( + "tensors [{}] are not found in last training strategy." + .format(str(tensor_not_in_pre))) + if tensor_not_in_cur: + warnings.warn( + "tensors [{}] are not found in current training strategy." 
+ .format(str(tensor_not_in_cur))) + if tensor_not_in_ckpt: + warnings.warn( + "tensors [{}] are found in pre_strategy, but are not found " + "in checkpoint files, please check your checkpoint files." + .format(str(tensor_not_in_ckpt))) + + return tensors_dict + + def convert_with_prefix_match(self, tensors_dict, tensor_not_in_pre, + tensor_not_in_cur): + # the name which is in cur_process and can match with pre_process + tensor_match_with_pre = [] + # the name which is in pre_process and can match with cur_process + tensor_match_with_cur = [] + for cur_name in tensor_not_in_pre: + prefix_name = cur_name + while prefix_name.find("_") != -1: + prefix_name = prefix_name[:prefix_name.rfind("_")] + for pre_name in tensor_not_in_cur: + if prefix_name in pre_name: + # 'cur_name' of cur_process can match with 'pre_name' of pre_process + self._pre_name = pre_name + self._cur_name = cur_name + pre_tensor_list = self._tensors_dict[pre_name] + pre_dist_attr = self._pre_strategy[pre_name] + cur_dist_attr = self._cur_strategy[cur_name] + try: + tensors_dict[cur_name] = Converter.merge_and_slice( + pre_tensor_list, pre_dist_attr, cur_dist_attr) + except ValueError as err: + raise ValueError( + "Fail to convert tensor '{}' by '{}'. ".format( + str(cur_name), str(pre_name)) + str(err)) + self._logger.info( + "tensor [{}] is matched with tensor [{}]".format( + cur_name, pre_name)) + tensor_match_with_pre.append(cur_name) + tensor_match_with_cur.append(pre_name) + break + break + + return tensors_dict, tensor_match_with_pre, tensor_match_with_cur + + @staticmethod + def merge_and_slice(tensor_list, pre_dist_attr, cur_dist_attr): + """ + Merge tensors with previous dist_attr and slice tensors with current dist_attr + + Returns: + tensor(numpy.ndarray): a tensor's value of current rank. 
+ """ + assert isinstance(tensor_list, list) + assert all(isinstance(p, np.ndarray) for p in tensor_list) + + if pre_dist_attr == cur_dist_attr: + # skip merge and slice tensor + rank_id = paddle.distributed.get_rank() + index = cur_dist_attr["process_group"].index(rank_id) + tensor = tensor_list[index] + else: + pre_dims_mapping = pre_dist_attr["dims_mapping"] + cur_dims_mapping = cur_dist_attr["dims_mapping"] + if len(set(pre_dims_mapping)) > 1 or -1 not in pre_dims_mapping: + # merge tensor + tensor = Converter.merge_with_dist_attr(tensor_list, + pre_dist_attr) + else: + # skip merge tensor + tensor = tensor_list[0] + + if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping: + # slice tensor + tensor = Converter.slice_with_dist_attr(tensor, cur_dist_attr) + + return tensor + + @staticmethod + def merge_with_dist_attr(tensor_list, dist_attr): + """ Merge tensor with distributed attribute """ + from .reshard import _compute_complete_shape, _compute_partition_index + + dims_mapping = dist_attr["dims_mapping"] + process_shape = dist_attr["process_shape"] + process_group = dist_attr["process_group"] + # get the complete shape of the tensor + complete_shape = _compute_complete_shape(tensor_list[0].shape, + process_shape, dims_mapping) + # merge the tensor with dist_attr + partition_tensor_list = [] + merged_partiton = [] + for process in process_group: + partition_index = _compute_partition_index( + process, complete_shape, dims_mapping, process_shape, + process_group) + index = process_group.index(process) + if partition_index not in merged_partiton: + merged_partiton.append(partition_index) + Converter.merge(partition_tensor_list, tensor_list[index], + partition_index, complete_shape) + + if len(partition_tensor_list) != 1: + raise ValueError("Fail to merge tensor with dist_attr '{}'.".format( + str(dist_attr))) + complete_tensor = partition_tensor_list[0][0] + return complete_tensor + + @staticmethod + def slice_with_dist_attr(tensor, dist_attr): + """ Slice tensor with distributed attribute """ + dims_mapping = dist_attr["dims_mapping"] + process_shape = dist_attr["process_shape"] + process_group = dist_attr["process_group"] + # slice the tensor with dist_attr + partition_index_list = Converter._get_split_indices( + tensor.shape, dims_mapping, process_shape, process_group) + sliced_tensor_list = Converter.split(tensor, partition_index_list, + len(partition_index_list)) + # get the current tensor's index in sliced_tensor_list + rank_id = paddle.distributed.get_rank() + sliced_tensor_index = Converter._get_sliced_index( + rank_id, tensor.shape, dims_mapping, process_shape, process_group) + if sliced_tensor_index not in range(len(sliced_tensor_list)): + raise ValueError("Fail to slice tensor with dist_attr '{}'.".format( + str(dist_attr))) + sliced_tensor = sliced_tensor_list[sliced_tensor_index] + return sliced_tensor + + @staticmethod + def merge(partition_tensor_list, tensor, partition_index, complete_shape): + """ + Merge partitial tensors to a complete. + + Returns: + None + + Examples: + .. 
code-block:: python + + import numpy as np + partition_tensor_list = [(np.array([[[1.11, 1.12]]]), [[0,1],[0,1],[0,2]])] + tensor = np.array([[[1.13, 1.14]]]) + partition_index = [[0,1],[0,1],[2,4]] + + _merge_tensor(partition_tensor_list, tensor, partition_index) + # partition_tensor_list: [(np.array([[[1.11, 1.12, 1.13, 1.14]]]), [[0,1],[0,1],[0,4]])] + """ + from .reshard import _compute_concat_info + + if len(partition_tensor_list) == 1: + is_complete_data = True + for idx, item in enumerate(partition_tensor_list[0][1]): + if item[0] != 0 or item[1] != complete_shape[idx]: + is_complete_data = False + break + if is_complete_data: + return + + if not partition_tensor_list: + partition_tensor_list.append((tensor, partition_index)) + else: + i = 0 + while i < len(partition_tensor_list): + concat_axis, first_order, new_partition = _compute_concat_info( + partition_tensor_list[i][1], partition_index) + if concat_axis != -1: + if first_order == 0: + new_tensor = np.concatenate( + (partition_tensor_list[i][0], tensor), + axis=concat_axis) + else: + new_tensor = np.concatenate( + (tensor, partition_tensor_list[i][0]), + axis=concat_axis) + + partition_tensor_list.pop(i) + Converter.merge(partition_tensor_list, new_tensor, + new_partition, complete_shape) + break + i += 1 + + @staticmethod + def split(complete_tensor, partition_index_list, length): + """ + Slice a complete tensor. + + Returns: + sliced_tensor_list(list): sliced tensors with 'partition_index_list' + + Examples: + .. code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + rank = 2 + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + sliced_tensor_list = split(complete_tensor, [[], [], [2, 4]], 3) + # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] + """ + sliced_tensor_list = [] + axis = len(complete_tensor.shape) - length + sliced_tensor = np.split( + complete_tensor, partition_index_list[axis], axis=axis) + if length == 1: + return sliced_tensor + for tensor in sliced_tensor: + sliced_tensor_list.extend( + Converter.split(tensor, partition_index_list, length - 1)) + return sliced_tensor_list + + @staticmethod + def _get_split_indices(complete_shape, dims_mapping, process_shape, + process_group): + """ + Get split indices of every dimension. + + Returns: + split_indices_list(list): the split indices of every dimension of the tensor + + Examples: + .. 
code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + index = _get_split_indices(complete_shape, dims_mapping, process_shape, process_group) + # index: [[], [], [2, 4]] + """ + from .reshard import _compute_partition_index + + split_indices_list = [] + for process in process_group: + partition_index = _compute_partition_index( + process, complete_shape, dims_mapping, process_shape, + process_group) + if split_indices_list: + for dim in range(len(partition_index)): + split_indices_list[dim].extend(partition_index[dim]) + else: + split_indices_list = partition_index + split_indices_list = list( + map(lambda x, y: list(set(x) - set([y]) - set([0])), + split_indices_list, complete_shape)) + split_indices_list = [sorted(x) for x in split_indices_list] + return split_indices_list + + @staticmethod + def _get_sliced_index(rank_id, complete_shape, dims_mapping, process_shape, + process_group): + """ + Get sliced_tensor's index of current rank in all sliced tensors list. + + Returns: + sliced_tensor_index(int): the index of sliced tensor in sliced_tensor_list + + Examples: + .. code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + rank = 2 + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + slice_tensor = _slice_tensor(complete_tensor, [[], [], [2, 4]], 3) + # slice_tensor: + # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] + + index = _get_sliced_index(rank, complete_shape, dims_mapping + process_shape, process_group) + # index: 2 + """ + from .reshard import _compute_partition_index + + partition_index = _compute_partition_index( + rank_id, complete_shape, dims_mapping, process_shape, process_group) + sliced_index = 0 + for i, shape in enumerate(complete_shape): + if dims_mapping[i] == -1: + slice_shape = shape + else: + slice_shape = shape // process_shape[dims_mapping[i]] + if shape == 1: + index = 0 + else: + index = (partition_index[i][0] + 1) // slice_shape + sliced_index = sliced_index * (shape // slice_shape) + index + return sliced_index diff --git a/python/paddle/distributed/auto_parallel/tuner/__init__.py b/python/paddle/distributed/auto_parallel/tuner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..513558501a0eb218b772a8c02142d3c320675710 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
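A minimal standalone sketch of the merge-then-slice flow that Converter.merge_and_slice in converter.py above implements. It uses plain numpy with a made-up 2x4 tensor, two ranks, and hard-coded dims_mappings instead of the private reshard helpers, so it only illustrates the idea rather than the real code path:

import numpy as np

# Full tensor as it would look on a single device.
complete = np.arange(8).reshape(2, 4)

# Previous strategy: dims_mapping [0, -1] over process_group [0, 1]
# -> the tensor was stored as two row-wise shards, one per rank.
pre_shards = np.split(complete, 2, axis=0)

# Step 1 (merge): concatenate the per-rank shards back along the sharded axis.
merged = np.concatenate(pre_shards, axis=0)
assert (merged == complete).all()

# Step 2 (slice): current strategy dims_mapping [-1, 0] -> shard along columns;
# each rank keeps the piece indexed by its position in process_group.
cur_shards = np.split(merged, 2, axis=1)
rank = 1                      # stand-in for paddle.distributed.get_rank()
local_value = cur_shards[rank]
print(local_value)            # [[2 3]
                              #  [6 7]]

When the two strategies are identical, converter.py skips both steps and simply returns the shard at the rank's index, which is exactly the early-return branch of merge_and_slice.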
diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/tuner/storable.py new file mode 100644 index 0000000000000000000000000000000000000000..d61e53a02724088c89f2e8cfafc91ca0047aa967 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/storable.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + + +class Storable(object): + def get_state(self): + raise NotImplementedError + + def set_state(self, state): + raise NotImplementedError + + def save(self, path): + state = self.get_state() + state_json = json.dumps(state) + with open(path, "w") as f: + f.write(state_json) + return str(path) + + def load(self, path): + with open(path, "r") as f: + state_data = f.read() + state = json.loads(state_data) + self.set_state(state) diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py new file mode 100644 index 0000000000000000000000000000000000000000..f63364c5b75ef03a81d8b293515f3bc5a55fce78 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import contextlib +import copy +import math +import random +import numpy as np + +from .tunable_variable import Boolean +from .tunable_variable import Fixed +from .tunable_variable import Choice +from .tunable_variable import IntRange +from .tunable_variable import FloatRange + + +class TunableSpace(object): + """ + A TunableSpace is constructed by the tunable variables. 
+ """ + + def __init__(self): + # Tunable variables for this tunable variables + self._variables = {} + # Specific values coresponding to each tunable variable + self._values = {} + + @property + def variables(self): + return self._variables + + @property + def values(self): + return self._values + + def get_value(self, name): + if name in self.values: + return self.values[name] + else: + raise KeyError("{} does not exist.".format(name)) + + def set_value(self, name, value): + if name in self.values: + self.values[name] = value + else: + raise KeyError("{} does not exist.".format(name)) + + def _exists(self, name): + if name in self._variables: + return True + return False + + def _retrieve(self, tv): + tv = tv.__class__.from_state(tv.get_state()) + if self._exists(tv.name): + return self.get_value(tv.name) + return self._register(tv) + + def _register(self, tv): + self._variables[tv.name] = tv + if tv.name not in self.values: + self.values[tv.name] = tv.default + return self.values[tv.name] + + def __getitem__(self, name): + return self.get_value(name) + + def __setitem__(self, name, value): + self.set_value(name, value) + + def __contains__(self, name): + try: + self.get_value(name) + return True + except (KeyError, ValueError): + return False + + def fixed(self, name, default): + tv = Fixed(name=name, default=default) + return self._retrieve(tv) + + def boolean(self, name, default=False): + tv = Boolean(name=name, default=default) + return self._retrieve(tv) + + def choice(self, name, values, default=None): + tv = Choice(name=name, values=values, default=default) + return self._retrieve(tv) + + def int_range(self, name, start, stop, step=1, default=None): + tv = IntRange( + name=name, start=start, stop=stop, step=step, default=default) + return self._retrieve(tv) + + def float_range(self, name, start, stop, step=None, default=None): + tv = FloatRange( + name=name, start=start, stop=stop, step=step, default=default) + return self._retrieve(tv) + + def get_state(self): + return { + "variables": [{ + "class_name": v.__class__.__name__, + "state": v.get_state() + } for v in self._variables.values()], + "values": dict((k, v) for (k, v) in self.values.items()) + } + + @classmethod + def from_state(cls, state): + ts = cls() + for v in state["variables"]: + v = _deserialize_tunable_variable(v) + ts._variables[v.name] = v + ts._values = dict((k, v) for (k, v) in state["values"].items()) + return ts + + +def _deserialize_tunable_variable(state): + classes = (Boolean, Fixed, Choice, IntRange, FloatRange) + cls_name_to_cls = {cls.__name__: cls for cls in classes} + + if isinstance(state, classes): + return state + + if (not isinstance(state, dict) or "class_name" not in state or + "state" not in state): + raise ValueError( + "Expect state to be a python dict containing class_name and state as keys, but found {}" + .format(state)) + + cls_name = state["class_name"] + cls = cls_name_to_cls[cls_name] + if cls is None: + raise ValueError("Unknown class name {}".format(cls_name)) + + cls_state = state["state"] + deserialized_object = cls.from_state(cls_state) + return deserialized_object diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py new file mode 100644 index 0000000000000000000000000000000000000000..9549b44c48ecb0b04ac22fafa6dcf5b6ff9aa0ae --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class TunableVariable(object): + """ + Tunablevariable base class. + """ + + def __init__(self, name, default=None): + self.name = name + self._default = default + + @property + def default(self): + return self._default + + def get_state(self): + return {"name": self.name, "default": self.default} + + @classmethod + def from_state(cls, state): + return cls(**state) + + +class Fixed(TunableVariable): + """ + Fixed variable which cannot be changed. + """ + + def __init__(self, name, default): + super(Fixed, self).__init__(name=name, default=default) + self.name = name + if not isinstance(default, (str, int, float, bool)): + raise ValueError( + "Fixed must be an str, int, float or bool, but found {}" + .format(default)) + self._default = default + + def random(self, seed=None): + return self._default + + def __repr__(self): + return "Fixed(name: {}, value: {})".format(self.name, self.default) + + +class Boolean(TunableVariable): + """ + Choice between True and False. + """ + + def __init__(self, name, default=False): + super(Boolean, self).__init__(name=name, default=default) + if default not in {True, False}: + raise ValueError( + "default must be a Python boolean, but got {}".format(default)) + + def random(self, seed=None): + rng = np.random.default_rng(seed) + return rng.choice((True, False)) + + def __repr__(self): + return 'Boolean(name: "{}", default: {})'.format(self.name, + self.default) + + +class Choice(TunableVariable): + def __init__(self, name, values, default=None): + super(Choice, self).__init__(name=name, default=default) + + types = set(type(v) for v in values) + if len(types) > 1: + raise TypeError( + "Choice can contain only one type of value, but found values: {} with types: {}." + .format(str(values), str(types))) + + if isinstance(values[0], str): + values = [str(v) for v in values] + if default is not None: + default = str(default) + elif isinstance(values[0], int): + values = [int(v) for v in values] + if default is not None: + default = int(default) + elif isinstance(values[0], float): + values = [float(v) for v in values] + if default is not None: + default = float(default) + elif isinstance(values[0], bool): + values = [bool(v) for v in values] + if default is not None: + default = bool(default) + else: + raise TypeError( + "Choice can only contain str, int, float, or boll, but found: {} " + .format(str(values))) + self.values = values + + if default is not None and default not in values: + raise ValueError( + "The default value should be one of the choices {}, but found {}". 
+ format(values, default)) + self._default = default + + @property + def default(self): + if self._default is None: + if None in self.values: + return None + return self.values[0] + return self._default + + def random(self, seed=None): + rng = np.random.default_rng(seed) + return rng.choice(self.values) + + def get_state(self): + state = super(Choice, self).get_state() + state["values"] = self.values + return state + + def __repr__(self): + return 'Choice(name: "{}", values: {}, default: {})'.format( + self.name, self.values, self.default) + + +class IntRange(TunableVariable): + """ + Integer range. + """ + + def __init__(self, name, start, stop, step=1, default=None, endpoint=False): + super(IntRange, self).__init__(name=name, default=default) + self.start = self._check_int(start) + self.stop = self._check_int(stop) + self.step = self._check_int(step) + self._default = default + self.endpoint = endpoint + + @property + def default(self): + if self._default is not None: + return self._default + return self.start + + def random(self, seed=None): + rng = np.random.default_rng(seed) + value = (self.stop - self.start) * rng.random() + self.start + if self.step is not None: + if self.endpoint: + values = np.arange(self.start, self.stop + 1e-7, step=self.step) + else: + values = np.arange(self.start, self.stop, step=self.step) + closest_index = np.abs(values - value).argmin() + value = values[closest_index] + return int(value) + + def get_state(self): + state = super(IntRange, self).get_state() + state["start"] = self.start + state["stop"] = self.stop + state["step"] = self.step + state["default"] = self._default + return state + + def _check_int(self, val): + int_val = int(val) + if int_val != val: + raise ValueError("Expects val is an int, but found: {}.".format( + str(val))) + return int_val + + def __repr__(self): + return "IntRange(name: {}, start: {}, stop: {}, step: {}, default: {})".format( + self.name, self.start, self.stop, self.step, self.default) + + +class FloatRange(TunableVariable): + """ + Float range. 
+ """ + + def __init__(self, + name, + start, + stop, + step=None, + default=None, + endpoint=False): + super(FloatRange, self).__init__(name=name, default=default) + self.stop = float(stop) + self.start = float(start) + if step is not None: + self.step = float(step) + else: + self.step = None + self._default = default + self.endpoint = endpoint + + @property + def default(self): + if self._default is not None: + return self._default + return self.start + + def random(self, seed=None): + rng = np.random.default_rng(seed) + value = (self.stop - self.start) * rng.random() + self.start + if self.step is not None: + if self.endpoint: + values = np.arange(self.start, self.stop + 1e-7, step=self.step) + else: + values = np.arange(self.start, self.stop, step=self.step) + closest_index = np.abs(values - value).argmin() + value = values[closest_index] + return value + + def get_state(self): + state = super(FloatRange, self).get_state() + state["start"] = self.start + state["stop"] = self.stop + state["step"] = self.step + state["endpoint"] = self.endpoint + return state + + def __repr__(self): + return "FloatRange(name: {}, start: {}, stop: {}, step: {}, default: {}, endpoint: {})".format( + self.name, self.start, self.stop, self.step, self.default, + self.endpoint) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index b42f21989abd77679993a1c8b52681351e4dfb40..1a3a8a4883d8beb84181609740d2b836f548bc2c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -900,11 +900,12 @@ def save_persistables(exe, dirname, main_program, filename=None): def is_opt_vars(var): # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer - # now only Momentum and adam are compatible with sharding - # support EMA optimizer + # now only Momentum and adam are compatible with sharding, + # support EMA optimizer with '_ema_0', + # support offload with '@offload_0' and '.cast_fp16' checks = [ "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", - "_velocity_0", "_ema_0" + "_velocity_0", "_ema_0", "@offload_0", ".cast_fp16" ] for check in checks: if var.name.endswith(check) and var.persistable: diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 9886ca4e2deace4c625ead51852841e7c761be21..f96273cc84caf46f4f02c62e648ce70445b52d28 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -912,7 +912,6 @@ def _device2cpu(trans_param, convert_dtype=False): def _cpu2device(param): tmp_p = param.fw_storage.cuda(DEV_ID) - param.fw_storage._clear() if tmp_p.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_p = paddle.cast(tmp_p, Type.fp16.value) diff --git a/python/paddle/distributed/run/__init__.py b/python/paddle/distributed/run/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f25ddb794cc4d573429ac960e646bd8125c48d16 --- /dev/null +++ b/python/paddle/distributed/run/__init__.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .job.container import Container +from .job.pod import Pod +from .job.job import Job +from . import plugins + +#__all__ = [Container, Pod, Job] +''' +Paddle distribution training entry ``python -m paddle.distributed.run``. + +Help + +# for arg usage and explanation, try the following command +# python -m paddle.distributed.run -h + +Collective Mode + +Case 1: 1 node + +use all visible devices +# python -m paddle.distributed.run train.py + +use specified devices +# python -m paddle.distributed.run --devices=0,1,2,3 train.py + +Case 2: multi-node, auto detect ip/port + +# python -m paddle.distributed.run --np 2 train.py +# auto print following command +# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py +# then copy and paste above command to other nodes + +Case 3: multi-node, specified master/rendezvous server + +# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py +# the master ip must be one of the node and the port must available + +Parameter Server Mode + +Case 1.1: 1 node, 1 ps, 1 trainer + +# python -m paddle.distributed.run --mode ps train.py +# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py + +Case 1.2: 1 node, 2 ps, 2 trainer + +# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py + +Case 2: 2 node, 2 ps, 2 trainer per node + +# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py +# auto print following command +# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# then copy and paste above command to other nodes + +Case 3: multi-node, specified master/rendezvous server + +# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# the master ip must be one of the node and the port must available + +Case 4: specified servers and trainers in each node + +python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py + + +Elastic Mode + +# run following command in 3 node to run immediately, or in 2 node to run after elastic_timeout +# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py + +# once the peer number changes between 2:3, the strategy holds + +''' diff --git a/python/paddle/distributed/run/__main__.py b/python/paddle/distributed/run/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..e32df59a328081e33aa86b42ed9b8e489ac399e8 --- /dev/null +++ b/python/paddle/distributed/run/__main__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import Context +from . import controllers + +# initialize the context to run +ctx = Context() + +# initialize the selected controller +c = controllers.init(ctx) + +# run the pods +c.run() + +# manager or just wait pod +c.finalize() diff --git a/python/paddle/distributed/run/context/__init__.py b/python/paddle/distributed/run/context/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..86dff0f1f8056e784268a6ef3a3ebabb44aa9c6d --- /dev/null +++ b/python/paddle/distributed/run/context/__init__.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser, REMAINDER +import os, copy + +from paddle.distributed.run import plugins + +from .node import Node +from .status import Status + +import logging + + +class Context(object): + def __init__(self, enable_plugin=True): + os.environ.pop('http_proxy', None) + os.environ.pop('https_proxy', None) + + self.args = self.parse_args() + self.envs = self.fetch_envs() + self.logger = self.get_logger() + + self.node = Node() + self.status = Status() + + self.set_env_in_args() + + # design for event queue, later + self.events = [] + + if enable_plugin: + self._enable_plugin() + + def get_envs(self): + return self.envs.copy() + + def _enable_plugin(self): + for pl in plugins.enabled_plugins: + pl(self) + + def parse_args(self): + parser = ArgumentParser() + + base_group = parser.add_argument_group("Base Parameters") + + base_group.add_argument( + "--master", + type=str, + default=None, + help="the master/rendezvous server, ip:port") + + base_group.add_argument( + "--rank", type=int, default=-1, help="the peer rank") + + base_group.add_argument( + "--log", type=str, default="INFO", help="log level. Default INFO") + + base_group.add_argument( + "--np", + type=str, + default="1", + help="the number of peers, i.e. pod/node number") + + base_group.add_argument( + "--nproc_per_node", + type=int, + default=None, + help="the number of processes in a pod") + + base_group.add_argument( + "--log_dir", + type=str, + default="log", + help="the path for each process's log. Default ./log") + base_group.add_argument( + "--mode", + type=str, + default="collective", + help="run mode of the job, collective/ps/ps-heter") + + base_group.add_argument( + "--id", + type=str, + default="default", + help="unique id of the job. Default default") + + base_group.add_argument( + "--devices", + type=str, + default=None, + help="accelerate devices. 
as --gpus,npus,xps") + + base_group.add_argument( + "--host", type=str, default=None, help="host ip") + + base_group.add_argument( + "training_script", + type=str, + help="the full path of py script," + "followed by arguments for the " + "training script") + + base_group.add_argument('training_script_args', nargs=REMAINDER) + + ps_group = parser.add_argument_group("Parameter-Server Parameters") + # for parameter server + ps_group.add_argument( + "--servers", + type=str, + default='', + help="servers endpoints full list") + ps_group.add_argument( + "--trainers", + type=str, + default='', + help="trainers endpoints full list") + + ps_group.add_argument( + "--trainer_num", type=int, default=None, help="number of trainers") + ps_group.add_argument( + "--server_num", type=int, default=None, help="number of servers") + ps_group.add_argument( + "--gloo_port", type=int, default=6767, help="gloo http port") + ps_group.add_argument( + "--with_gloo", type=str, default="0", help="use gloo or not") + + # parameter elastic mode + elastic_group = parser.add_argument_group("Elastic Parameters") + elastic_group.add_argument( + "--max_restart", + type=int, + default=3, + help="the times can restart. Default 3") + + elastic_group.add_argument( + "--elastic_level", + type=int, + default=-1, + help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart" + ) + + elastic_group.add_argument( + "--elastic_timeout", + type=int, + default=30, + help="seconds to wait before elastic perform training") + return parser.parse_args() + + def _valide_env(self, key): + if key in ['POD_IP']: + return True + if key.endswith('_VISIBLE_DEVICES'): + return True + if key.startswith('PADDLE_'): + return True + + return False + + def fetch_envs(self): + ge = os.environ.copy() + + black_env_list = ['http_proxy', 'https_proxy'] + for key in black_env_list: + ge.pop(key, None) + + return ge + ''' + # use black list instead white list + return {k: ge[k] for k in ge if self._valide_env(k)} + ''' + + def get_logger(self, level=logging.INFO): + logger = logging.getLogger("PADDLERUN") + logger.setLevel(self.args.log.upper() or level) + formatter = logging.Formatter( + fmt='%(name)s %(levelname)s %(asctime)s %(message)s') + ch = logging.StreamHandler() + ch.setFormatter(formatter) + logger.addHandler(ch) + return logger + + def set_env_in_args(self): + env_args = { + 'POD_IP': 'host', + 'PADDLE_MASTER': 'master', + 'PADDLE_DEVICES': 'devices', + 'PADDLE_NP': 'np', + 'PADDLE_MODE': 'mode', + 'PADDLE_LOG': 'log', + 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', + 'PADDLE_JOB_ID': 'id', + 'PADDLE_RANK': 'rank', + 'PADDLE_LOG_DIR': 'log_dir', + 'PADDLE_MAX_RESTlRT': 'max_restart', + 'PADDLE_ELASTIC_LEVEL': 'elastic_level', + 'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout', + 'PADDLE_SERVER_NUM': 'server_num', + 'PADDLE_TRAINER_NUM': 'trainer_num', + 'PADDLE_SERVERS_ENDPOINTS': 'servers', + 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', + 'PADDLE_GLOO_PORT': 'gloo_port', + 'PADDLE_WITH_GLOO': 'with_gloo', + } + + for k, v in env_args.items(): + if k in self.envs: + setattr(self.args, v, self.envs[k]) diff --git a/python/paddle/distributed/run/context/device.py b/python/paddle/distributed/run/context/device.py new file mode 100644 index 0000000000000000000000000000000000000000..d8bbd851ccf83a1ebfac60758576384bbe1aa4f4 --- /dev/null +++ b/python/paddle/distributed/run/context/device.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +class DeviceType: + CPU = 'cpu' + GPU = 'gpu' + XPU = 'xpu' + NPU = 'npu' + + +class Device(object): + def __init__(self, dtype=None, count=1, memory="", labels=""): + self.dtype = dtype + self.count = count + self.memory = memory + self.labels = labels + + def __str__(self): + return ",".join(self.labels) + + @classmethod + def parse_device(self): + dev = Device() + visible_devices = None + if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.GPU + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif 'XPU_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.XPU + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif 'ASCEND_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.NPU + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + + if visible_devices and visible_devices != 'all': + dev.labels = visible_devices.split(',') + dev.count = len(dev.labels) + else: + return self.detect_device() + + return dev + + @classmethod + def detect_device(self): + import paddle.fluid as fluid + + dev = Device() + num = 0 + visible_devices = None + if fluid.core.is_compiled_with_cuda(): + dev.dtype = DeviceType.GPU + num = fluid.core.get_cuda_device_count() + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_xpu(): + dev.dtype = DeviceType.XPU + num = fluid.core.get_xpu_device_count() + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_npu(): + dev.dtype = DeviceType.NPU + num = fluid.core.get_npu_device_count() + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + + if num == 0: + dev.dtype = DeviceType.CPU + elif visible_devices is None or visible_devices == "all" or visible_devices == "": + dev.labels = [str(x) for x in range(0, num)] + dev.count = num + else: + dev.labels = visible_devices.split(',') + dev.count = len(dev.labels) + + return dev diff --git a/python/paddle/distributed/run/context/event.py b/python/paddle/distributed/run/context/event.py new file mode 100644 index 0000000000000000000000000000000000000000..23e8e7a5014002b480b717623dec2d5ee62eb743 --- /dev/null +++ b/python/paddle/distributed/run/context/event.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
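The device selection in device.py above boils down to reading the first *_VISIBLE_DEVICES variable that is set. A self-contained sketch of that branch, with a made-up environment value and no paddle import:

import os

# Assumed environment, for illustration only.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,3"

# Mirrors Device.parse_device(): a non-empty value other than 'all' is taken
# verbatim -- its comma-separated entries become the device labels.
visible = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv("NVIDIA_VISIBLE_DEVICES")
if visible and visible != "all":
    labels = visible.split(",")    # ['0', '1', '3']
    count = len(labels)            # 3
else:
    # The real code falls back to Device.detect_device(), which asks the
    # compiled paddle runtime how many devices exist.
    labels, count = [], 1
print(labels, count)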
+ + +class Event(object): + def __init__(self, kind="status", message="", fatal=False): + self.kind = kind + self.message = message + self.fatal = fatal diff --git a/python/paddle/distributed/run/context/node.py b/python/paddle/distributed/run/context/node.py new file mode 100644 index 0000000000000000000000000000000000000000..1ece4db0fbbeed379c2cda343022dd371a9e7540 --- /dev/null +++ b/python/paddle/distributed/run/context/node.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .device import Device + +import socket +import struct +from contextlib import closing + + +class Node(object): + def __init__(self): + # self.device = Device.detect_device() + self.device = Device.parse_device() + self.ip = self.get_host_ip() + self.free_ports = [] + + def get_host_ip(self): + try: + self.hostname = socket.gethostname() + self.ip = socket.gethostbyname(socket.getfqdn(self.hostname)) + return self.ip + except: + return '127.0.0.1' + + def get_free_ports(self, n=1): + free_ports = [self.get_free_port() for i in range(n)] + self.free_ports += free_ports + return free_ports + + def get_ports_occupied(self): + return self.free_ports + + @classmethod + def get_free_port(self): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, + struct.pack('ii', 1, 0)) + s.bind(('', 0)) + return s.getsockname()[1] + + @classmethod + def is_server_ready(self, ip, port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + #sock.settimeout(0.01) + #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + result = sock.connect_ex((ip, int(port))) + if result == 0: + return True + else: + return False diff --git a/python/paddle/distributed/run/context/resource.py b/python/paddle/distributed/run/context/resource.py new file mode 100644 index 0000000000000000000000000000000000000000..faffed704c1f078f9fed131ef1ade98add60b5d9 --- /dev/null +++ b/python/paddle/distributed/run/context/resource.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
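The two socket helpers on Node above pair naturally: grab an OS-assigned free port, then later probe whether anything is listening on it. A trimmed standalone restatement using the same socket calls, with no paddle import:

import socket
import struct
from contextlib import closing

def get_free_port():
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        # SO_LINGER with a zero timeout avoids the probe port sitting in
        # TIME_WAIT after the socket is closed.
        s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
        s.bind(('', 0))            # port 0 -> the kernel picks any free port
        return s.getsockname()[1]

def is_server_ready(ip, port):
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        return sock.connect_ex((ip, int(port))) == 0   # 0 means the connect succeeded

port = get_free_port()
print(port, is_server_ready("127.0.0.1", port))   # nothing listens yet, so False

As with any probe of this kind, the port is only guaranteed free at the moment of the call.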
+ + +class Resource(object): + def __init__(self): + self.devices = [] diff --git a/python/paddle/distributed/run/context/status.py b/python/paddle/distributed/run/context/status.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbf3623ec22ed56b5ce136d8a6813291be69e8f --- /dev/null +++ b/python/paddle/distributed/run/context/status.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Status(object): + UNINIT = "uninit" + READY = "ready" + RUNNING = "running" + FAILED = "failed" + TERMINATING = "terminating" + RESTARTING = "restarting" + UNKNOWN = "unknown" + COMPLETED = "completed" + DONE = "done" # should exit whatever status + + def __init__(self): + self._current_status = None + + def current(self): + return self._current_status + + def is_running(self): + return self._current_status == self.RUNNING + + def is_restarting(self): + return self._current_status == self.RESTARTING + + def is_done(self): + if self._current_status in [self.DONE, self.COMPLETED, self.FAILED]: + return True + else: + return False + + def run(self): + self._current_status = self.RUNNING + + def fail(self): + self._current_status = self.FAILED + + def complete(self): + self._current_status = self.COMPLETED + + def restart(self): + self._current_status = self.RESTARTING + + def done(self): + self._current_status = self.DONE diff --git a/python/paddle/distributed/run/controllers/__init__.py b/python/paddle/distributed/run/controllers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5557151ad5489cb4af0c34b3ad47c31774b3326 --- /dev/null +++ b/python/paddle/distributed/run/controllers/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
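The controllers package above resolves a controller by asking each class in order whether it can handle the context, so the registration order matters; a minimal sketch of the same first-match dispatch with placeholder classes (names invented for illustration):

class _Elastic:
    # Stand-in for CollectiveElasticController: only enabled for etcd masters.
    @classmethod
    def enable(cls, ctx):
        return ctx.get('master', '').startswith('etcd://')

    def __init__(self, ctx):
        self.name = 'elastic'


class _Collective:
    # Stand-in for CollectiveController: the catch-all fallback, kept last.
    @classmethod
    def enable(cls, ctx):
        return True

    def __init__(self, ctx):
        self.name = 'collective'


_registry = [_Elastic, _Collective]  # order is significant


def pick(ctx):
    for c in _registry:
        if c.enable(ctx):
            return c(ctx)


if __name__ == '__main__':
    print(pick({'master': 'etcd://10.0.0.1:2379'}).name)  # elastic
    print(pick({'master': '10.0.0.1:8090'}).name)         # collective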
+ +__all__ = ["init"] + +from .collective import CollectiveController +from .collective import CollectiveElasticController +from .ps import PSController + +# the order is extremely important +_controllers = [ + CollectiveElasticController, + PSController, + CollectiveController, +] + + +def init(ctx): + for c in _controllers: + if c.enable(ctx): + return c(ctx) diff --git a/python/paddle/distributed/run/controllers/collective.py b/python/paddle/distributed/run/controllers/collective.py new file mode 100644 index 0000000000000000000000000000000000000000..c4feb54428a07265693c0969e6e385a380e22f3d --- /dev/null +++ b/python/paddle/distributed/run/controllers/collective.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .controller import Controller + +import json +import os +import six +import time + + +class CollectiveController(Controller): + @classmethod + def enable(cls, ctx): + if ctx: + ctx.logger.debug("{} enabled".format(cls.__name__)) + return True + else: + return False + + def build_pod(self): + self.pod.replicas = self.pod_replicas() + + # rank will be reset when restart + self.pod.rank = self.ctx.args.rank + + port = self.ctx.node.get_free_port() + + # compatible + endpoints = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(self.pod.replicas) + ] + + data = json.dumps({ + 'name': self.pod.name, + 'rank': self.pod.rank, + 'replicas': self.pod.replicas, + 'dtype': self.ctx.node.device.dtype, + 'candidate': '{}:{}'.format(self.ctx.node.ip, port), + 'endpoints': ",".join(endpoints), + }) + + peer_list, rank = self.master.sync_peers( + '/{}/info'.format(self.job.id), self.pod.name, data, + self.job.replicas, self.pod.rank) + self.pod.rank = rank + + if len(peer_list) < 1: + return False + + peer_list = [json.loads(i) for i in peer_list] + + self.ctx.logger.debug("sync peers done {}".format(peer_list)) + self.save_pod_log(peer_list) + + global_size = sum([i['replicas'] for i in peer_list]) + rank_offset = sum([i['replicas'] for i in peer_list[:rank]]) + ''' + The new designed collective need nothing but a master endpoint + ''' + collective_master = peer_list[0]['candidate'] + + job_endpoints = [i['endpoints'] for i in peer_list] + + self.pod.reset() + for i in range(self.pod.replicas): + e = { + "PADDLE_MASTER": collective_master, + "PADDLE_GLOBAL_SIZE": "{}".format(global_size), + "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas), + "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset), + "PADDLE_LOCAL_RANK": "{}".format(i), + ## compatible env + "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints), + "PADDLE_CURRENT_ENDPOINT": endpoints[i], + "PADDLE_TRAINER_ID": "{}".format(i + rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(global_size), + "PADDLE_RANK_IN_NODE": str(i), + } + self.add_container(envs=e, log_tag=i) + + return True + + +class CollectiveElasticController(CollectiveController): + @classmethod + def enable(cls, ctx): + if ctx.args.master and 
ctx.args.master.startswith("etcd://"): + ctx.logger.debug("{} enabled".format(cls.__name__)) + return True + else: + return False + + def register(self): + if self.job.id == 'default': + self.ctx.logger.warning( + 'Using default job name may cause conflict, add --id in args') + + self.master.register_heartbeat(self.job.id, self.pod.name) + + def watch(self) -> bool: + ''' + watch self and peer status, return true to exit + ''' + while not self.ctx.status.is_done(): + # self status + status = self.pod.watch(timeout=2) + self.ctx.logger.debug("Pod status {}, Ctx status {}".format( + status, self.ctx.status.current())) + + # completed + if status == self.ctx.status.COMPLETED: + self.master.set_status(status) + self.ctx.status.complete() + self.ctx.logger.info("Pod complete {}".format(status)) + return True + + # self failure + elif status == self.ctx.status.FAILED: + self.master.set_status(status) + self.master.restart_peer() + self.ctx.logger.info("Pod failed {}".format(status)) + self.pod.stop() + + if self.ctx.args.elastic_level <= 0: + return True + else: + return False + + # peer failure + if self.ctx.status.is_restarting() and self.master.get_status( + ) != self.ctx.status.COMPLETED: + self.pod.stop() + return False + + #peers = self.master.fetch_peer_alive() + #print("peers {}".format(peers)) + + def run(self): + + timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10 + self.register() + + while self.pod.restart <= self.ctx.args.max_restart: + + self.build_job() + + ok, replicas = self.master.wait_peer_ready( + self.job.replicas_min, self.job.replicas_max, timeout) + if ok: + self.job.replicas = replicas + else: + self.ctx.logger.warnning("peer not ready {}".format(self.job)) + break + + self.ctx.logger.debug("Run {}".format(self.job)) + + if not self.build_pod(): + continue + + self.master.set_status(self.ctx.status.RUNNING) + self.ctx.status.run() + + assert len(self.pod.containers) > 0, "No container in the pod" + self.ctx.logger.debug("Run {}".format(self.pod)) + self.ctx.logger.debug("Run {}".format(self.pod.containers[0])) + + self.pod.deploy() + + if self.watch(): + break + + self.ctx.logger.debug("Job done {}".format(self.job)) diff --git a/python/paddle/distributed/run/controllers/controller.py b/python/paddle/distributed/run/controllers/controller.py new file mode 100644 index 0000000000000000000000000000000000000000..2d904cf2a2cca5b9abaab06d1545c03c160e3d93 --- /dev/null +++ b/python/paddle/distributed/run/controllers/controller.py @@ -0,0 +1,192 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
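To make the rank bookkeeping of CollectiveController.build_pod above concrete, the sketch below reproduces only the arithmetic: once peers are synced, local process i receives global rank i + rank_offset, where rank_offset sums the replicas of all lower-ranked pods (peer entries reduced to the single field used here):

def global_ranks(peer_list, rank):
    # peer_list is ordered by pod rank, as returned by master.sync_peers.
    global_size = sum(p['replicas'] for p in peer_list)
    rank_offset = sum(p['replicas'] for p in peer_list[:rank])
    local = peer_list[rank]['replicas']
    return global_size, [rank_offset + i for i in range(local)]


if __name__ == '__main__':
    peers = [{'replicas': 2}, {'replicas': 2}, {'replicas': 4}]
    # For the pod with rank 1: PADDLE_TRAINERS_NUM=8, PADDLE_TRAINER_ID in {2, 3}
    print(global_ranks(peers, 1))  # (8, [2, 3])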
+ +import sys +import os +import signal + +from paddle.distributed.run.job import Job +from paddle.distributed.run.job import Pod +from paddle.distributed.run.job import Container + +from .master import Master + +import time + + +class ControleMode: + COLLECTIVE = "collective" + PS = "ps" + + +class ControllerBase(object): + def __init__(self, ctx): + signal.signal(signal.SIGTERM, self.signal_handler) + signal.signal(signal.SIGABRT, self.signal_handler) + signal.signal(signal.SIGINT, self.signal_handler) + + self.ctx = ctx + self.master = Master.factory(self.ctx) + + self.job = Job(np=self.ctx.args.np, + mode=self.ctx.args.mode, + id=self.ctx.args.id) + self.pod = Pod() + + self.join_server = None + + def run(self): + self.build_job() + self.build_pod() + + if len(self.pod.containers) < 1: + self.ctx.logger.error("No container in the pod {}".format(self.pod)) + return + + self.ctx.logger.info("Run {}".format(self.pod)) + self.ctx.logger.debug(self.pod.containers[0]) + + self.pod.deploy() + + self.watch() + + def watch(self) -> bool: + status = self.pod.watch() + + if status == self.ctx.status.COMPLETED: + self.ctx.logger.info("Pod {}".format(status)) + elif status == self.ctx.status.FAILED: + self.ctx.logger.info("Pod {}".format(status)) + self.ctx.logger.error("Container failed !!!\n{}".format( + self.pod.failed_container())) + self.pod.tail() + self.pod.stop() + + def stop(self, sigint=None): + self.ctx.logger.debug("Controller stop") + self.master.stop() + self.pod.stop(sigint) + + def finalize(self): + self.pod.join() + self.master.stop() + + self.ctx.logger.info("Exit code {}".format(self.pod.exit_code)) + sys.exit(self.pod.exit_code) + + def signal_handler(self, sigint, frame): + self.ctx.logger.info("Terminating with signal {}".format(sigint)) + + if hasattr(self, 'sigint'): + time.sleep(5) + sys.exit(sigint) + + self.sigint = sigint + self.ctx.status.done() + self.stop(sigint) + time.sleep(1) + self.ctx.logger.debug("Exit with signal {}".format(sigint)) + sys.exit(sigint) + + +class Controller(ControllerBase): + ''' + Controller API for customization + ''' + + def build_job(self): + ''' + build job fill the job info. + ''' + self.ctx.logger.info(self.job) + + def build_pod(self) -> bool: + ''' + build pod includes creating containers etc. 
+ + Return True if succeed + ''' + raise NotImplementedError + + def _get_entrypoint(self): + entrypoint = [sys.executable, "-u", self.ctx.args.training_script] + entrypoint.extend(self.ctx.args.training_script_args) + return entrypoint + + def _get_out_err_file(self, out=None, err=None): + if out and self.ctx.args.log_dir != "": + out = os.path.join(self.ctx.args.log_dir, out) + if err and self.ctx.args.log_dir != "": + err = os.path.join(self.ctx.args.log_dir, err) + return out, (err or out) + + def new_container(self, + entrypoint=None, + envs={}, + use_ctx_env=True, + out=None, + err=None): + c = Container( + entrypoint=(entrypoint or self._get_entrypoint()), + env=(self.ctx.get_envs() if use_ctx_env else {}), ) + c.outfile, c.errfile = self._get_out_err_file(out, err) + c.update_env(envs) + return c + + def add_container(self, + container=None, + entrypoint=None, + envs={}, + log_tag=None, + is_init=False): + if not is_init and log_tag is not None: + log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name, + log_tag) + else: + log_file = None + + if not container: + container = self.new_container( + entrypoint=entrypoint, envs=envs, out=log_file, err=log_file) + + if is_init: + self.pod.add_init_container(container) + else: + self.pod.add_container(container) + + def pod_replicas(self): + ''' + how many process/container should be run in pod + ''' + + if self.ctx.args.nproc_per_node: + return int(self.ctx.args.nproc_per_node) + else: + return self.ctx.node.device.count + + def save_pod_log(self, info): + ''' + save_pod_log append *info* to the log file of pod.name + ''' + if not self.ctx.args.log_dir: + return + + f = os.path.join(self.ctx.args.log_dir, + '{}.{}.log'.format(self.job.id, self.pod.name)) + try: + os.makedirs(os.path.dirname(f), exist_ok=True) + with open(f, 'a+') as fd: + fd.write(str(info)) + except Exception as e: + self.ctx.logger.error("save log failed because {}".format(e)) diff --git a/python/paddle/distributed/run/controllers/master.py b/python/paddle/distributed/run/controllers/master.py new file mode 100644 index 0000000000000000000000000000000000000000..257ba3bad8da3c331ac303b7a3ee415461fd13b8 --- /dev/null +++ b/python/paddle/distributed/run/controllers/master.py @@ -0,0 +1,289 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
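The per-container log naming used by Controller.add_container and _get_out_err_file above follows a fixed <job>.<pod>.<tag>.log pattern under --log_dir; a small sketch of just that convention (values invented for illustration):

import os


def container_log_path(log_dir, job_id, pod_name, log_tag):
    # Mirrors add_container + _get_out_err_file: one log file per container,
    # placed under log_dir when one is configured.
    name = '{}.{}.{}.log'.format(job_id, pod_name, log_tag)
    return os.path.join(log_dir, name) if log_dir else name


if __name__ == '__main__':
    print(container_log_path('log', 'default', 'abcdef', 0))       # log/default.abcdef.0.log
    print(container_log_path('log', 'default', 'abcdef', 'ps.1'))  # log/default.abcdef.ps.1.log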
+ +from paddle.distributed.run.utils.kv_client import KVClient +from paddle.distributed.run.utils.kv_server import KVServer + +import time +import sys +import six +import threading +import copy +import random + +ETCD_PROTOCAL = 'etcd://' + + +class Master(object): + ''' + Master is a distributed store design to exchange info among nodes + ''' + + MAIN = "main" + STANDBY = "standby" + PATICIPANT = "participant" + + def __init__(self, ctx): + self.ctx = ctx + self.server = None + self.initialized = False + self.endpoint = None + + def stop(self): + raise NotImplementedError + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + raise NotImplementedError + + @classmethod + def factory(cls, ctx): + if ctx.args.master and ctx.args.master.startswith(ETCD_PROTOCAL): + return ETCDMaster(ctx) + else: + return HTTPMaster(ctx) + + +class HTTPMaster(Master): + def lazy_init(self): + if self.initialized: + return + + self.role = Master.PATICIPANT + + if self.ctx.args.master: + self.endpoint = self.ctx.args.master + ip, port = self.endpoint.split(':') + if ip in ['127.0.0.1', self.ctx.node.ip]: + time.sleep(2 * random.random()) + while not self.ctx.node.is_server_ready(ip, int(port)): + try: + self.server = KVServer(int(port)) + self.role = Master.MAIN + break + except Exception as e: + self.ctx.logger.warning("start master failed {}".format( + e)) + time.sleep(0.1) + continue + else: + port = self.ctx.node.get_free_port() + self.endpoint = "{}:{}".format(self.ctx.node.ip, port) + self.server = KVServer(port) + self.role = Master.MAIN + + print("Copy the following command to other nodes to run.") + cmd = [ + sys.executable.split('/')[-1], "-m", "paddle.distributed.run" + ] + cmd.extend(["--master", self.endpoint]) + cmd.extend(sys.argv[1:]) + print("-" * 80) + print(" ".join(cmd)) + print("-" * 80) + + if self.ctx.args.rank >= 0: + self.ctx.logger.warning( + "--rank set in the command may not compatible in auto mode") + + if '127.0.0.1' in self.endpoint: + self.endpoint = self.endpoint.replace('127.0.0.1', self.ctx.node.ip) + self.client = KVClient(self.endpoint) + + self.initialized = True + + self._start_server() + + def _start_server(self): + if self.server and not self.server.started: + self.server.start() + self.ctx.logger.debug("KV server start at {}".format(self.endpoint)) + + def _stop_server(self): + if self.server and not self.server.stopped: + self.server.stop() + self.ctx.logger.debug("KV server stopped") + + def stop(self): + self._stop_server() + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + if size < 2: + return [value], 0 + + self.lazy_init() + + while not self.ctx.status.is_done(): + if self.client.wait_server_ready(timeout=5): + break + else: + self.ctx.logger.warning("master not ready") + time.sleep(0.1) + + # 'aaaaaa' make suer main pod (master server) as rank 0 + ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key + k = "{}/{}/{}".format(prefix, ky, rank) + + while not self.ctx.status.is_done(): + if not self.client.put(k, value): + self.ctx.logger.warning("put value failed") + time.sleep(0.1) + continue + + rjson = self.client.get_prefix(prefix) + self.ctx.logger.debug("sync peers {}".format(rjson)) + if rjson and len(rjson) == size: + if rank < 0: + keys = list(rjson.keys()) + keys.sort() + ret = [rjson[k] for k in keys] + idx = ret.index(value) + return ret, idx + else: + ret = [None] * size + for k, v in rjson.items(): + ret[int(k.split('/')[-1])] = v + return ret, rank + else: + time.sleep(0.5) + return [], 0 + + 
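The HTTP rendezvous above boils down to a put-then-poll gather over a shared key-value store; the sketch below runs the same protocol in-process, with a plain dict standing in for KVServer/KVClient and with the retry/backoff and the main pod's 'aaaaaa' ordering trick omitted:

store = {}  # in-process stand-in for the KV server at the master endpoint


def put(key, value):
    store[key] = value


def get_prefix(prefix):
    return {k: v for k, v in store.items() if k.startswith(prefix)}


def publish(prefix, key, value, rank=-1):
    # Each pod publishes its serialized info under <prefix>/<key>/<rank>.
    put('{}/{}/{}'.format(prefix, key, rank), value)


def gather(prefix, value, size):
    # Poll until all `size` peers are visible; with automatic ranking the
    # sorted key order decides the final rank, as in HTTPMaster.sync_peers.
    while True:
        seen = get_prefix(prefix)
        if len(seen) == size:
            ordered = [seen[k] for k in sorted(seen)]
            return ordered, ordered.index(value)
        # a real peer would sleep briefly here before retrying


if __name__ == '__main__':
    publish('/job/info', 'pod-a', 'info-of-a')
    publish('/job/info', 'pod-b', 'info-of-b')
    print(gather('/job/info', 'info-of-a', 2))  # (['info-of-a', 'info-of-b'], 0)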
+class ETCDMaster(Master): + def __init__(self, ctx): + super().__init__(ctx) + + if self.ctx.args.master: + # etcd://localhost:2379 + self.endpoint = self.ctx.args.master.strip("etcd://") + + import etcd3 + + host, port = self.endpoint.split(':') + self.client = etcd3.client(host=host, port=port) + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + ''' + sync_peers gather all value for key under scope prefix + result always be sorted either by rank or alphabet of pod.name + ''' + path = "{}/{}/{}".format(prefix, key, rank) + + self.client.delete_prefix(prefix) + + self.ctx.logger.debug("sync path {} value {}".format(path, value)) + + while not self.ctx.status.is_done(): + self.client.put(path, six.b(value)) + + result = [i for i in self.client.get_prefix(prefix)] + result = copy.deepcopy(result) + self.ctx.logger.debug("sync peers {}".format(result)) + + if len(result) == size: + if rank < 0: + keys = [six.ensure_str(i[1].key) for i in result] + sorted_keys = [six.ensure_str(i[1].key) for i in result] + sorted_keys.sort() + values = [six.ensure_str(i[0]) for i in result] + ret = [values[keys.index(k)] for k in sorted_keys] + idx = ret.index(value) + return ret, idx + else: + ret = [None] * size + for v, k in result: + ii = int(six.ensure_str(k.key).split('/')[-1]) + if ii < 0: + self.ctx.logger.error( + "rank {} error in sync".format(ii)) + ret[ii] = six.ensure_str(v) + return ret, rank + else: + time.sleep(0.5) + + def register_heartbeat(self, job_id, pod_id, ttl=10): + if hasattr(self, 'heartbeat_prefix'): + self.ctx.logger.warning("Heartbeat already done") + return + + self.job_prefix = '/paddle/{}'.format(job_id) + self.heartbeat_prefix = '{}/heartbeat'.format(self.job_prefix) + + lease = self.client.lease(ttl) + + #self.client.delete_prefix(self.job_prefix) + + beat_path = "{}/{}".format(self.heartbeat_prefix, pod_id) + self.client.put(beat_path, six.b(pod_id), lease=lease) + + def _beat_watch(event): + self.ctx.status.restart() + + beat_watch = self.client.add_watch_prefix_callback( + self.heartbeat_prefix, _beat_watch) + + def _heartbeat(): + while not self.ctx.status.is_done(): + try: + lease.refresh() + if pod_id not in self.fetch_peer_alive(): + self.client.put(beat_path, six.b(pod_id), lease=lease) + self.ctx.logger.debug("Heartbeat register again") + except Exception as e: + self.ctx.logger.error("Heartbeat error {}".format(e)) + time.sleep(ttl / 2) + self.ctx.logger.debug("Heartbeat done") + self.client.cancel_watch(beat_watch) + + self.beat_thread = threading.Thread( + name='heartbeat', target=_heartbeat, daemon=True) + self.beat_thread.start() + + def fetch_peer_alive(self): + peer_alive = [ + six.ensure_str(i[0]) + for i in self.client.get_prefix(self.heartbeat_prefix) + ] + self.ctx.logger.debug("peer alive {}".format(peer_alive)) + return peer_alive + + def wait_peer_ready(self, replicas_min, replicas_max, timeout): + end = time.time() + timeout + while not self.ctx.status.is_done() and time.time() < end: + if len(self.fetch_peer_alive()) == replicas_max: + return (True, replicas_max) + else: + time.sleep(0.5) + + np = len(self.fetch_peer_alive()) + if np >= replicas_min and np <= replicas_max: + return (True, np) + else: + return (False, np) + + def restart_peer(self): + self.client.delete_prefix(self.heartbeat_prefix) + + def set_status(self, status): + assert self.client.put( + self.job_prefix, six.b(status), + lease=self.client.lease(600)), "set status failed {}".format(status) + + def get_status(self): + return 
six.ensure_str(self.client.get(self.job_prefix)[0] or '') + + def stop(self): + if hasattr(self, 'beat_thread'): + self.ctx.status.done() + # TODO(kuizhiqing) thread should exit + #self.beat_thread.join() diff --git a/python/paddle/distributed/run/controllers/ps.py b/python/paddle/distributed/run/controllers/ps.py new file mode 100644 index 0000000000000000000000000000000000000000..cc43c336cf1862fe075e5a4463b1f5b666a5005c --- /dev/null +++ b/python/paddle/distributed/run/controllers/ps.py @@ -0,0 +1,221 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .controller import Controller, ControleMode + +import json +import os, shutil + + +class PSController(Controller): + @classmethod + def enable(cls, ctx): + if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len( + ctx.args.servers) > 0: + ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.mode = ControleMode.PS + return True + else: + return False + + def build_pod(self): + if self.ctx.args.servers and self.ctx.args.trainers: + self._build_pod_with_args() + else: + self._build_pod_with_master() + + def _build_pod_with_args(self): + if '127.0.0.1' in self.ctx.args.servers: + host = '127.0.0.1' + else: + host = self.ctx.node.ip + + server_endpoints = [s for s in self.ctx.args.servers.split(",")] + trainer_endpoints = [s for s in self.ctx.args.trainers.split(",")] + servers = [ + s for s in self.ctx.args.servers.split(",") if s.startswith(host) + ] + trainers = [ + s for s in self.ctx.args.trainers.split(",") if s.startswith(host) + ] + server_num = len(servers) + trainer_num = len(trainers) + + self.pod.replicas = server_num + trainer_num + + self.save_pod_log([server_endpoints, trainer_endpoints]) + + import tempfile + gloo_rendezvous_dir = tempfile.mkdtemp() + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + + gloo_port = self.ctx.args.gloo_port + gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port) + + _gloo_envs = { + "PADDLE_GLOO_RENDEZVOUS": "3", + "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir, + "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http, + "PADDLE_WITH_GLOO": self.ctx.args.with_gloo + } + + for i in range(server_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.ctx.args.servers, + "PADDLE_TRAINER_ENDPOINTS": self.ctx.args.trainers, + "PADDLE_PORT": servers[i].split(":")[1], + "PADDLE_ROLE": "PSERVER", + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + trainer_rank_offset = 0 + for s in trainer_endpoints: + if s.startswith(host): + break + else: + trainer_rank_offset += 1 + + for i in range(trainer_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": trainers[i].split(":")[1], + "PADDLE_ROLE": "TRAINER", + "TRAINING_ROLE": 
"TRAINER", + "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + def _build_pod_with_master(self): + + self.pod.rank = self.ctx.args.rank + + server_num = self.ctx.args.server_num or 1 + servers = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(server_num) + ] + trainer_num = self.ctx.args.trainer_num or 1 + trainers = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(trainer_num) + ] + + data = json.dumps({ + 'name': self.pod.name, + 'rank': self.pod.rank, + 'servers': servers, + 'trainers': trainers, + 'dtype': self.ctx.node.device.dtype, + 'gloo_port': self.ctx.node.get_free_port(), + }) + + peer_list, rank = self.master.sync_peers( + '/{}/info'.format(self.job.id), self.pod.name, data, + self.job.replicas, self.pod.rank) + + self.ctx.logger.debug("sync peers done {}".format(peer_list)) + + peer_list = [json.loads(i) for i in peer_list] + + self.save_pod_log(peer_list) + + server_endpoints = [j for i in peer_list for j in i['servers']] + trainer_endpoints = [j for i in peer_list for j in i['trainers']] + #rank_offset = sum([i['replicas'] for i in peer_list[:rank]]) + + server_rank_offset = sum([len(i['servers']) for i in peer_list[:rank]]) + trainer_rank_offset = sum( + [len(i['trainers']) for i in peer_list[:rank]]) + + self.pod.rank = rank + + self.pod.replicas = server_num + trainer_num + + import tempfile + gloo_rendezvous_dir = tempfile.mkdtemp() + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + + gloo_port = peer_list[0]['gloo_port'] + gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port) + + _gloo_envs = { + "PADDLE_GLOO_RENDEZVOUS": "3", + "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir, + "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http, + "PADDLE_WITH_GLOO": self.ctx.args.with_gloo + } + + for i in range(server_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": + server_endpoints[i + server_rank_offset].split(":")[1], + "PADDLE_ROLE": "PSERVER", + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + for i in range(trainer_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": + trainer_endpoints[i + trainer_rank_offset].split(":")[1], + "PADDLE_ROLE": "TRAINER", + "TRAINING_ROLE": "TRAINER", + "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + ''' NEW VERSION + for i in range(server_num): + e = { + "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_ROLE": "PSERVER", + "PADDLE_RANK": "{}".format(i + server_rank_offset), + } + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + for i in range(trainer_num): + e = { + "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": 
",".join(trainer_endpoints), + "PADDLE_ROLE": "TRAINER_CPU", + "PADDLE_RANK": "{}".format(i + trainer_rank_offset), + } + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + ''' diff --git a/python/paddle/distributed/run/job/__init__.py b/python/paddle/distributed/run/job/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..66d2abbce21ebc72cda5373a3d3a242c077beaa8 --- /dev/null +++ b/python/paddle/distributed/run/job/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pod import Pod +from .job import Job +from .container import Container +from .status import Status + +__all__ = [ + 'Pod', + 'Job', + 'Container', + 'Status', +] diff --git a/python/paddle/distributed/run/job/container.py b/python/paddle/distributed/run/job/container.py new file mode 100644 index 0000000000000000000000000000000000000000..651932d6c88378034d7ab9cb05bac00ee3ea7ddf --- /dev/null +++ b/python/paddle/distributed/run/job/container.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import OrderedDict +from paddle.distributed.run.utils.process_context import ProcessContext + +from .status import Status + +import os, copy, sys +import time + + +class Container(object): + ''' + TODO(kuizhiqing) A container can be run by process/thread or just a callable function + ''' + + def __init__(self, entrypoint=[], rank=-1, env={}): + self._entrypoint = entrypoint + self._rank = rank + self._out = None + self._err = None + self._env = env + self._proc = None + + self._retry: int = 3 + self._grace_period = 10 + + self._log_handler = None + + @property + def entrypoint(self): + return self._entrypoint + + @entrypoint.setter + def entrypoint(self, entry): + self._entrypoint = entry + + @property + def rank(self): + return self._rank + + @rank.setter + def rank(self, r): + self._rank = r + + @property + def outfile(self): + return self._out + + @outfile.setter + def outfile(self, out): + self._out = out + + @property + def errfile(self): + return self._err + + @errfile.setter + def errfile(self, err): + self._err = err + + def update_env(self, env={}, **kwargs): + env = {k: v for k, v in env.items() if isinstance(v, str)} + self._env.update(env) + + kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)} + self._env.update(kwargs) + + def _get_fd(self, pth): + if not pth: + return None + + try: + d = os.path.dirname(pth) + if not os.path.isdir(d): + os.makedirs(d, exist_ok=True) + return open(pth, 'w') + except: + return None + + def start(self, timeout=-1): + end = time.time() + timeout + + if self._proc and self._proc.alive(): + return True + + self._stdout = self._get_fd(self._out) or sys.stdout + if self._out == self._err: + self._stderr = self._stdout + elif self._err: + self._stderr = self._get_fd(self._err) or sys.stderr + + self._proc = ProcessContext( + self._entrypoint, env=self._env, out=self._stdout, err=self._stderr) + self._proc.start() + + while timeout > 0 and time.time() < end: + if self._proc.alive(): + time.sleep(0.1) + continue + if self._proc.exit_code() == 0: + return True + return False + + def terminate(self, force=False): + if self._log_handler: + self._log_handler.close() + self._log_handler = None + + if self._proc and self._proc.alive(): + return self._proc.terminate(force) + + def wait(self, timeout=None): + self._proc.wait(timeout) + + def exit_code(self): + return self._proc.exit_code() if self._proc else -1 + + def status(self): + if not self._proc: + return Status.UNINIT + if self._proc.alive(): + return Status.RUNNING + elif self._proc.exit_code() == 0: + return Status.COMPLETED + else: + return Status.FAILED + + def __str__(self): + return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format( + self._rank, + self.status(), + self._entrypoint, + self.exit_code(), + self.errfile, + self._env, ) + + def logs(self, fn=None, offset=0, whence=1, lines=1000): + if not self._log_handler: + self._log_handler = open(self._out) + + if fn is None: + fn = sys.stdout + + self._log_handler.seek(offset, whence) + + try: + idx = 0 + for line in self._log_handler: + fn.write(line) + idx += 1 + if idx > lines: + break + finally: + return self._log_handler.tell() + + def tail(self, length=3000): + if not self._log_handler: + self._log_handler = open(self._out) + + self._log_handler.seek(0, 2) + ed = self._log_handler.tell() + + if ed > length: + self.logs(offset=ed - length, whence=0) + else: + self.logs(offset=0, whence=0) diff --git a/python/paddle/distributed/run/job/job.py b/python/paddle/distributed/run/job/job.py new 
file mode 100644 index 0000000000000000000000000000000000000000..3469ed862576faed3bd7546710927f638b8fe0d5 --- /dev/null +++ b/python/paddle/distributed/run/job/job.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class JobMode: + COLLECTIVE = 'collective' + PS = 'ps' + HETER = 'heter' + + +class Job(object): + def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"): + self._mode = mode + self._id = id + + self._replicas = 0 + self._replicas_min = self._replicas + self._replicas_max = self._replicas + self._elastic = False + + self.set_replicas(str(np)) + + def __str__(self): + return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format( + self.id, self.mode, self._replicas, self._replicas_min, + self._replicas_max, self.elastic) + + @property + def mode(self): + return self._mode + + @property + def id(self): + return self._id + + @property + def elastic(self): + return self._elastic + + @property + def replicas(self): + return self._replicas + + @property + def replicas_min(self): + return self._replicas_min + + @property + def replicas_max(self): + return self._replicas_max + + @replicas.setter + def replicas(self, replicas): + self._replicas = replicas + + def set_replicas(self, np: str): + np = str(np) if np else '1' + + if ':' in np: + nps = np.split(':') + self._replicas_min, self._replicas_max = int(nps[0]), int(nps[1]) + self._replicas = self._replicas_max # default to max + + self._elastic = True + else: + self._replicas = int(np) + self._replicas_min, self._replicas_max = self._replicas, self._replicas + + self._elastic = False diff --git a/python/paddle/distributed/run/job/pod.py b/python/paddle/distributed/run/job/pod.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c31edce1d552befac3a6f54e5e79c326b31c67 --- /dev/null +++ b/python/paddle/distributed/run/job/pod.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
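The --np value consumed by Job.set_replicas above accepts either a fixed count or an elastic min:max range; a hedged sketch of just that parsing rule:

def parse_np(np):
    # Mirrors Job.set_replicas: "4" -> fixed four replicas, "2:4" -> elastic,
    # defaulting to the maximum until live peers are counted.
    np = str(np) if np else '1'
    if ':' in np:
        lo, hi = map(int, np.split(':'))
        return {'replicas': hi, 'min': lo, 'max': hi, 'elastic': True}
    n = int(np)
    return {'replicas': n, 'min': n, 'max': n, 'elastic': False}


if __name__ == '__main__':
    print(parse_np('2:4'))  # {'replicas': 4, 'min': 2, 'max': 4, 'elastic': True}
    print(parse_np(1))      # {'replicas': 1, 'min': 1, 'max': 1, 'elastic': False}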
+ +from collections import OrderedDict +from .container import Container + +from .status import Status + +import random +import time + + +class PodSepc(object): + def __init__(self): + self._name = ''.join( + random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6)) + + # by controller + self._init_containers: List[Container] = [] + self._containers: List[Container] = [] + + #self.resource: Resource = None + #self.status: Status = None + + self._rank = -1 + self._init_timeout = 120 # 2 min timeout for each init container + self._restart = -1 + self._replicas = 0 # number of containers + self._exit_code = 0 + + +class Pod(PodSepc): + def __init__(self): + super().__init__() + + def __str__(self): + return "Pod: {}, replicas {}, status {}".format(self.name, + self.replicas, + self.status()) + + def failed_container(self): + for c in self._containers: + if c.status() == Status.FAILED: + return c + return None + + @property + def name(self): + return self._name + + @property + def replicas(self): + return self._replicas + + @replicas.setter + def replicas(self, r): + self._replicas = r + + @property + def rank(self): + return self._rank + + @rank.setter + def rank(self, r): + self._rank = r + + @property + def restart(self): + return self._restart + + @property + def containers(self): + return self._containers + + def add_container(self, c): + c.rank = len(self._containers) + self._containers.append(c) + + @property + def init_containers(self): + return self._init_containers + + def add_init_container(self, c): + c.rank = len(self._init_containers) + self._init_containers.append(c) + + @property + def exit_code(self): + for c in self._containers: + if c.exit_code() != 0: + return c.exit_code() + return 0 + + def deploy(self): + for i in self._init_containers: + i.start(self._init_timeout) + + for c in self._containers: + c.start() + + self._restart += 1 + + def stop(self, sigint=0): + for c in self._containers: + force = True if sigint == 9 else False + c.terminate(force) + + def join(self): + for c in self._containers: + c.wait(None) + + def status(self): + if self.is_failed(): + return Status.FAILED + + if self.is_completed(): + return Status.COMPLETED + + return Status.READY + + def reset(self): + self._init_containers = [] + self._containers = [] + + def is_failed(self): + for c in self._containers: + if c.status() == Status.FAILED: + return True + return False + + def is_completed(self): + for c in self._containers: + if c.status() != Status.COMPLETED: + return False + return True + + def logs(self, idx=None): + if idx is None: + if self.failed_container(): + self.failed_container().logs() + else: + self._containers[0].logs() + else: + self._containers[idx].logs() + + def tail(self, idx=None): + if idx is None: + if self.failed_container(): + self.failed_container().tail() + else: + self._containers[0].tail() + else: + self._containers[idx].tail() + + def watch(self, + all_list=[Status.COMPLETED], + any_list=[Status.FAILED], + interval=1, + timeout=-1): + ''' + watch return if any container status in any_list + or all container status in all_list + ''' + end = time.time() + timeout + while timeout < 0 or time.time() < end: + for c in self._containers: + if c.status() in any_list: + return c.status() + + s = [c.status() for c in self._containers] + if len(set(s)) == 1 and s[0] in all_list: + return s[0] + + time.sleep(interval) diff --git a/python/paddle/distributed/run/job/status.py b/python/paddle/distributed/run/job/status.py new file mode 100644 index 
0000000000000000000000000000000000000000..ae10c5adb6cbfe4713370a2f01c74569bfe98182 --- /dev/null +++ b/python/paddle/distributed/run/job/status.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Status(object): + UNINIT = "uninit" + READY = "ready" + RUNNING = "running" + FAILED = "failed" + TERMINATING = "terminating" + RESTARTING = "restarting" + UNKNOWN = "unknown" + COMPLETED = "completed" diff --git a/python/paddle/distributed/run/plugins/__init__.py b/python/paddle/distributed/run/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ec91402a7aad359c9860cf737a78cb7c1f1375d1 --- /dev/null +++ b/python/paddle/distributed/run/plugins/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six + +__all__ = [] + + +def log(ctx): + ctx.logger.info("----------- Configuration ----------------------") + for arg, value in sorted(six.iteritems(vars(ctx.args))): + ctx.logger.info("%s: %s" % (arg, value)) + ctx.logger.info("--------------------------------------------------") + + +def process_args(ctx): + # reset device by args + #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus + argdev = ctx.args.devices + if argdev: + ctx.node.device.labels = argdev.split(',') + ctx.node.device.count = len(ctx.node.device.labels) + ctx.logger.debug('Device reset by args {}'.format(argdev)) + + +def collective_compatible(ctx): + if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs: + ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0] + if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs: + ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0] + + +def rewrite_host_ip(ctx): + if ctx.args.host is not None and "." in ctx.args.host: + ctx.logger.warning('Host ip reset to {}'.format(ctx.args.host)) + ctx.node.ip = ctx.args.host + + +enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log] diff --git a/python/paddle/distributed/run/plugins/ip.py b/python/paddle/distributed/run/plugins/ip.py new file mode 100644 index 0000000000000000000000000000000000000000..0809ed5864da9f3bea29235621e7c29b75823391 --- /dev/null +++ b/python/paddle/distributed/run/plugins/ip.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
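The plugins listed in enabled_plugins above are plain callables that receive the run context and mutate it in place, executed in order before the controller starts; the sketch below shows that shape against a minimal stand-in context (the _Ctx stub is invented for illustration and is not part of the module):

class _Ctx:
    # Minimal stand-in exposing only the fields the bundled plugins touch.
    def __init__(self):
        self.envs = {'PADDLE_TRAINER_ENDPOINTS': '10.0.0.1:6170,10.0.0.2:6170'}
        self.master = None


def collective_compatible_sketch(ctx):
    # Same rule as the bundled collective_compatible plugin: reuse the first
    # legacy trainer endpoint as the master address.
    if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs:
        ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0]


if __name__ == '__main__':
    ctx = _Ctx()
    for plugin in [collective_compatible_sketch]:  # analogue of enabled_plugins
        plugin(ctx)
    print(ctx.master)  # 10.0.0.1:6170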
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import socket + + +def get_local_ip(ctx): + _, ip = _get_host_name_ip() + ctx.args.host = ip + ctx.envs["POD_IP"] = ip + + +def _get_host_name_ip(): + try: + host_name = socket.gethostname() + host_ip = socket.gethostbyname(host_name) + return host_name, host_ip + except: + return None diff --git a/python/paddle/distributed/run/utils/kv_client.py b/python/paddle/distributed/run/utils/kv_client.py new file mode 100644 index 0000000000000000000000000000000000000000..e19195412268a5c309d65a4a61e005ea512d685b --- /dev/null +++ b/python/paddle/distributed/run/utils/kv_client.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import time + + +class KVClient(object): + def __init__(self, endpoint='localhost:2379'): + self.endpoint = endpoint if endpoint.startswith( + "http://") else "http://{}".format(endpoint) + + def put(self, key, value): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.post(u, data=value, timeout=3) + if r.status_code == 200: + return True + else: + return False + except: + return False + + def get(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.get(u, timeout=3) + if r.status_code == 200: + ret = r.json() + return ret.get(key, '') + else: + return "error" + except: + return "" + + def get_prefix(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.get(u, timeout=3) + if r.status_code == 200: + return r.json() + except: + return "" + + def delete(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.delete(u, timeout=3) + if r.status_code == 200: + return True + else: + return False + except: + return False + + def wait_server_ready(self, timeout=3): + end = time.time() + timeout + while time.time() < end: + if self.get("/healthy") == "ok": + return True + + +if __name__ == '__main__': + cli = PKVClient("http://localhost:8090") + data = {"/workers/1": "rank1", "/workers/2": "rank2"} + for k, v in data.items(): + cli.put(k, v) + x = cli.get_prefix("/workers") + print(x) + for k, v in data.items(): + assert x[k] == v + + cli.put("key", "value") + print(cli.get("key")) + assert cli.get("key") == "value" + cli.delete("key") + 
print(cli.get("/key")) + print(cli.get("/healthy")) + assert cli.get("/healthy") == "ok" diff --git a/python/paddle/distributed/run/utils/kv_server.py b/python/paddle/distributed/run/utils/kv_server.py new file mode 100644 index 0000000000000000000000000000000000000000..2d7ae15f13d636f05536dbdd8f35434bb7c3bf97 --- /dev/null +++ b/python/paddle/distributed/run/utils/kv_server.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from http.server import HTTPServer +import http.server as SimpleHTTPServer + +from multiprocessing import Process + +import threading +import json + + +class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): + def do_GET(self): + with self.server.kv_lock: + ret = {} + for k, v in self.server.kv.items(): + if k.startswith(self.path): + ret[k] = v.decode(encoding="utf-8") + if ret: + self.output(200, json.dumps(ret).encode("utf-8")) + else: + self.output(404) + + def do_PUT(self): + self.do_POST() + + def do_POST(self): + content_length = int(self.headers['Content-Length'] or 0) + try: + value = self.rfile.read(content_length) + with self.server.kv_lock: + self.server.kv[self.path] = value + self.output(200) + return + except: + self.output(500) + + def do_DELETE(self): + with self.server.kv_lock: + if self.path in self.server.kv: + del self.server.kv[self.path] + self.output(200) + else: + self.output(404) + + def output(self, code, value=''): + self.send_response(code) + self.send_header("Content-Length", len(value)) + self.send_header("Content-Type", "application/json; charset=utf8") + self.end_headers() + if value: + self.wfile.write(value) + + def log_message(self, format, *args): + return + + +class KVServer(HTTPServer, object): + def __init__(self, port): + super(KVServer, self).__init__(('', port), KVHandler) + self.kv_lock = threading.Lock() + self.kv = {'/healthy': b'ok'} + self.port = port + self.stopped = False + self.started = False + + def start(self): + self.listen_thread = threading.Thread(target=self.serve_forever) + self.listen_thread.start() + self.started = True + + def stop(self): + self.shutdown() + self.listen_thread.join() + self.server_close() + self.stopped = True + + +class PKVServer(): + def __init__(self, port): + self._server = KVServer(port) + + def start(self): + self.proc = Process(target=self._server.start) + self.proc.daemon = True + self.proc.start() + + def stop(self): + self._server.stop() + self.proc.join() + + @property + def started(self): + return self._server.started + + @property + def stopped(self): + return self._server.stopped + + +if __name__ == '__main__': + #kv = PKVServer(8090) + kv = KVServer(8090) + kv.start() + import time + + #print("serve at 8090 for 600 s") + + time.sleep(600) diff --git a/python/paddle/distributed/run/utils/process_context.py b/python/paddle/distributed/run/utils/process_context.py new file mode 100644 index 0000000000000000000000000000000000000000..4d6fa8de794ff07874cc788f0abf0a283c066ae7 --- /dev/null +++ 
b/python/paddle/distributed/run/utils/process_context.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +import os, sys, signal, time + + +class ProcessContext(object): + def __init__(self, + cmd, + env=os.environ, + out=sys.stdout, + err=sys.stderr, + group=True, + preexec_fn=None): + self._cmd = cmd + self._env = env + self._preexec_fn = preexec_fn + self._stdout = out + self._stderr = err + self._group = group if os.name != 'nt' else False + self._proc = None + self._code = None + + def _start(self): + pre_fn = os.setsid if self._group else None + self._proc = subprocess.Popen( + self._cmd, + env=self._env, + stdout=self._stdout, + stderr=self._stderr, + preexec_fn=self._preexec_fn or pre_fn) + + def _close_std(self): + try: + if not self._stdout.isatty(): + self._stdout.close() + + if not self._stderr.isatty(): + self._stderr.close() + except: + pass + + def alive(self): + return self._proc and self._proc.poll() is None + + def exit_code(self): + return self._proc.poll() if self._proc else None + + def start(self): + self._start() + + def terminate(self, force=False, max_retry=3): + for i in range(max_retry): + if self.alive(): + if self._group: + os.killpg(os.getpgid(self._proc.pid), signal.SIGTERM) + else: + self._proc.terminate() + time.sleep(0.2) + else: + break + + if force and self.alive(): + self._proc.kill() + + self._close_std() + + return self.alive() + + def wait(self, timeout=None): + self._proc.wait(timeout) diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 2fdb20600f673b21e7cabd6ffe35c545b045bb5d..6fd4caa7b4a5c41e73fcf95ac50d0253bb3e7c79 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -39,19 +39,20 @@ def group_sharded_parallel(model, segment_size=2**20, sync_comm=False): """ - Use this module to configure and wrap up the parameters of the group shared module. + Use group_sharded_parallel can perform group shared configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation. + Usually, optimizer state + gradient segmentation is actually a re optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation. Args: model (Layer): The layer to be wrapped with group_sharded_parallel. optimizer (Optimizer): The optimizer to be wrapped with group_sharded_parallel. level (str): The different level of the group sharded. Such as `os`, `os_g`, `p_g_os`. - scaler (GradScaler, optional): The scaler to be wrapped with group_sharded_parallel. Defaults to None. - group (Group, optional): The group instance. 
Defaults to None.d - offload (bool, optional): Whether to perform optimizer state and gradient transfer CPU. Defaults to False. - sync_buffers (bool, optional): Whether to broadcast model buffers. Defaults to False. - buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. Defaults to 2**23. - segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20. - sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False. + scaler (GradScaler, optional): If AMP is used, you need to pass GradScaler. Defaults to None, indicating that GradScaler is not used. + group (Group, optional): The group instance. Defaults to None, indicating that the default environment group is used. + offload (bool, optional): Whether to use the offload function. Defaults to False, which means that the offload function is not used. + sync_buffers (bool, optional): Whether to broadcast model buffers. It is generally used when there are registered model buffers. Defaults to False, indicating that model buffers are not used. + buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23. + segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20. + sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used. Returns: model: A wrapper for group sharded given model. @@ -101,7 +102,7 @@ def group_sharded_parallel(model, def check_dtype(param): return param.dtype == paddle.float16 - params_fp16 = filter(check_dtype, model.parameters()) + params_fp16 = list(filter(check_dtype, model.parameters())) if scaler is None and len(params_fp16) > 0: raise ValueError("Please enter the correct scaler.") # convert model/optimizer/scaler @@ -146,10 +147,13 @@ def save_group_sharded_model(model, output, optimizer=None): """ Group sharded encapsulated model and optimizer state saving module. + .. note:: + If using save_group_sharded_model saves the model. When loading again, you need to set the model or optimizer state before using group_sharded_parallel. + Args: model (Layer): A wrapper for group sharded given model. output (str): Save directory. - optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None. + optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved. Examples: .. 
code-block:: python @@ -182,7 +186,7 @@ def save_group_sharded_model(model, output, optimizer=None): optimizer.clear_grad() # save model and optimizer state_dict - save_group_sharded_model(model, optimizer,output=output_dir) + save_group_sharded_model(model, optimizer, output=output_dir) """ logger_.info( "==========Begin to save group sharded model and optimizer==========") diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 997075590e5cf97241188b847c0c5b5036ecee59..7480909a2d88dda51971d0ef66ae6c88a56cd79c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -226,3 +226,5 @@ if core.is_compiled_with_npu(): atexit.register(core.npu_finalize) # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually. atexit.register(core.clear_executor_cache) +# NOTE(Aganlengzi): clean up KernelFactory in advance manually. +atexit.register(core.clear_kernel_factory) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index 30012fb8666fcb5256efa889de7440f6d709cccd..900541459f6fcd3f8caaf9d60b0aabba5c6c469e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -104,7 +104,7 @@ class FunctionSpec(object): if isinstance(input_var, np.ndarray): input_var = paddle.static.InputSpec.from_numpy(input_var) _set_spec_stop_gradient(input_var, True) - elif isinstance(input_var, core.VarBase): + elif isinstance(input_var, (core.VarBase, core.eager.Tensor)): stop_gradient = input_var.stop_gradient input_var = paddle.static.InputSpec.from_tensor(input_var) _set_spec_stop_gradient(input_var, stop_gradient) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index a442a8b92b6f7cf6c5c366e63ace110e9fb94e01..216f955b7510351c2dc6774a34a485b2341e76aa 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -148,10 +148,7 @@ class PartialProgramLayer: self._origin_main_program = self._verify_program(main_program) self._tmp_scope_vec = self._create_scope_vec() - # A fake_var to handle empty input or output - self.__fake_vars = _create_fake_var() # Set default mode to train - self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True custom_white_list, custom_black_list = None, None @@ -163,6 +160,14 @@ class PartialProgramLayer: custom_white_list=custom_white_list, custom_black_list=custom_black_list) + @LazyInitialized + def __fake_vars(self): + return _create_fake_var() + + @LazyInitialized + def _double_grads(self): + return self._get_double_grads(self._origin_main_program) + @LazyInitialized def _infer_program(self): """ @@ -356,8 +361,10 @@ class PartialProgramLayer: def drop_scope_if_no_grad(self): tracer = framework._dygraph_tracer() + scope = self._tmp_scope_vec.value().get_scope() if isinstance( + self._tmp_scope_vec, (core.VarBase)) else self._tmp_scope_vec[0] if self.training and not tracer._has_grad: - self._tmp_scope_vec.value().get_scope().drop_kids() + scope.drop_kids() @property def program(self): @@ -449,18 +456,14 @@ class PartialProgramLayer: def _create_scope_vec(self): # Hold forward variables tmp_scope_vec = None + inner_scope = core.Scope() if not core._in_eager_mode(): tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], 
"program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - # TODO(jiabin): Support this later. - # else: - # tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [], - # "program_out_scope", - # core.VarDesc.VarType.STEP_SCOPES, True) - - inner_scope = core.Scope() tmp_scope_vec.value().set_scope(inner_scope) + else: + tmp_scope_vec = [inner_scope] return tmp_scope_vec def _restore_out(self, out_vars): @@ -598,12 +601,10 @@ def _create_fake_var(): core.VarDesc.VarType.RAW, False) ] else: - return [] - # TODO(jiabin): Support this later - # return [ - # core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", - # core.VarDesc.VarType.RAW, False) - # ] + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] def partial_program_from(concrete_program): diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 652916491eed7e511b610c2d00b0612604ecee8b..86d76f1b20a74c9f8bb51e23d9fc7d450717f173 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -30,7 +30,7 @@ from paddle.fluid.dygraph import to_variable, no_grad from paddle.utils import deprecated from ..layers import collective from paddle.fluid.dygraph import base as imperative_base -from paddle.fluid.framework import ParamBase +from paddle.fluid.framework import ParamBase, _in_eager_mode __all__ = ["prepare_context", "ParallelEnv", "DataParallel"] @@ -397,6 +397,16 @@ def sync_params_buffers(model, 'axis': 0}) +@imperative_base.no_grad +@framework.dygraph_only +def sync_eager_params(model, comm_group=None, src_rank=0): + for _, param in model._obtain_parameters_buffers().items(): + if not isinstance(param, core.eager.Tensor): + raise TypeError("The data type of '%s' must be '%s'" % + (param.name, core.eager.Tensor)) + comm_group.broadcast(param, src_rank).synchronize() + + class DataParallel(layers.Layer): """ Run the dygraph module with data parallelism. @@ -576,6 +586,7 @@ class DataParallel(layers.Layer): self.process_group = process_group self.gradient_as_buffer_view = gradient_as_buffer_view self.static_graph = static_graph + self.var_dtype = core.eager.Tensor if _in_eager_mode() else core.VarBase # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by @@ -592,11 +603,20 @@ class DataParallel(layers.Layer): "ParallelContext must be initialized before. You should use init_parallel_env() before" \ "constructing the DataParallel." + if self.process_group is None and _in_eager_mode(): + raise RuntimeError( + "Process group should be built in DataParallel of eager mode." + ) + # sync buffer and params # TODO(liuyuhui) Currently not support xpu. 
xpu is # still broadcasting parameters when calling layer if not paddle.is_compiled_with_xpu(): - sync_params_buffers(self._layers) + if _in_eager_mode(): + sync_eager_params( + self._layers, comm_group=self.process_group) + else: + sync_params_buffers(self._layers) self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024) # NOTE(shenliang03): We can set environment variables to control @@ -620,9 +640,9 @@ class DataParallel(layers.Layer): if param is None or param in params_set: continue params_set.add(param) - if not isinstance(param, core.VarBase): - raise TypeError("The data type of '%s' must be Varbase" % - param.name) + if not isinstance(param, self.var_dtype): + raise TypeError("The data type of '%s' must be '%s'" % + (param.name, self.var_dtype)) if param.trainable: layers_param.append((sublayer, param)) @@ -649,19 +669,32 @@ class DataParallel(layers.Layer): check_layer_sparse(sublayer) for sublayer, _ in layers_param ] - self.group_indices = core.assign_group_by_size( - trainable_parameters, is_sparse_gradient, - [self.last_comm_buffer_size, self.comm_buffer_size]) + if _in_eager_mode(): + self.group_indices = core.eager_assign_group_by_size( + trainable_parameters, is_sparse_gradient, + [self.last_comm_buffer_size, self.comm_buffer_size]) + + self._reducer = core.EagerReducer( + trainable_parameters, + list(reversed(self.group_indices)), is_sparse_gradient, + self.process_group, + [self.last_comm_buffer_size, self.comm_buffer_size], + self.find_unused_parameters) + else: + self.group_indices = core.assign_group_by_size( + trainable_parameters, is_sparse_gradient, + [self.last_comm_buffer_size, self.comm_buffer_size]) - self._reducer = core.Reducer( - trainable_parameters, - list(reversed(self.group_indices)), is_sparse_gradient, - parallel_helper.__parallel_ctx__clz__, - [self.last_comm_buffer_size, self.comm_buffer_size], - self.find_unused_parameters) + self._reducer = core.Reducer( + trainable_parameters, + list(reversed(self.group_indices)), is_sparse_gradient, + parallel_helper.__parallel_ctx__clz__, + [self.last_comm_buffer_size, self.comm_buffer_size], + self.find_unused_parameters) def _find_varbase(self, obj): - if isinstance(obj, core.VarBase): + var_type = core.eager.Tensor if _in_eager_mode() else core.VarBase + if isinstance(obj, var_type): return [obj] if isinstance(obj, (list, tuple)): return itertools.chain(*map(self._find_varbase, obj)) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 563cd433910054522b48b9b0f03a036d0d5abe69..d0552ca41f0daf56ce23317dd06cb5744baaff84 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -52,6 +52,12 @@ final_state_name_mapping = { "axis1": "axis1", "axis2": "axis2", "out": "Out", + }, + "one_hot": { + "final_op_name": "final_state_one_hot", + "x": "X", + "num_class": "depth", + "out": "Out", } } @@ -140,7 +146,12 @@ class Tracer(core.Tracer): outputs[retname][j].reconstruct_from_(returns[i][j], False) else: - outputs[retname][0].reconstruct_from_(returns[i], False) + if isinstance(outputs[retname], list): + outputs[retname][0].reconstruct_from_(returns[i], + False) + else: + outputs[retname].reconstruct_from_(returns[i], + False) elif isinstance(returns, list): assert len(outputs.keys()) == 1 key = list(outputs.keys())[0] diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 6843c0e4c3fa85f20b408e7536cf1902dafe3f45..2b67a2029727f6b8f917239094a1b906d5cd6a62 100644 
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -311,7 +311,7 @@ def monkey_patch_varbase(): """ if core._in_eager_mode(): - if not self.grad._is_initialized(): + if self.grad is None: return None # TODO(wanghuancoder) support SELECTED_ROWS return self.grad.numpy() diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index c89990be34ca06c6277033bb6b6c0844e7d9a327..acaf7cb74280bed23b20feab2a96aa85a9bb5cea 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -153,6 +153,7 @@ PD_BUILD_GRAD_OP(custom_relu_no_x_in_backward) .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardWithoutXInferShape)); void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { + out->reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( @@ -164,6 +165,7 @@ void relu_cpu_backward_out(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out, paddle::Tensor* grad_x) { + grad_x->reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( grad_out.data(), diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 33c5ede299bd47c87490473920bb80b18cd75bf5..4bb773cdaec21712f262bcb217710f6909efd20a 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -94,6 +94,7 @@ void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { int numel = x.size(); int block = 512; int grid = (numel + block - 1) / block; + out->reshape(x.shape()); PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "relu_cuda_forward_kernel", ([&] { relu_cuda_forward_kernel<<>>( @@ -108,6 +109,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x, int numel = out.size(); int block = 512; int grid = (numel + block - 1) / block; + grad_x->reshape(x.shape()); PD_DISPATCH_FLOATING_AND_HALF_TYPES( out.type(), "relu_cuda_backward_kernel", ([&] { relu_cuda_backward_kernel<<>>( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py index 1c9c6eedbaeb8c1c3f06d42d82a8ec5cc28750f6..785bfc74229817c022f7f9e80481dde156d4e178 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py @@ -20,6 +20,7 @@ import paddle from paddle.utils.cpp_extension import load, get_build_directory from utils import paddle_includes, extra_cc_args, extra_nvcc_args from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
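(Referring back to the python/paddle/fluid/dygraph/parallel.py changes above: in eager mode, DataParallel now requires an explicit process_group and routes gradient fusion through core.EagerReducer instead of core.Reducer.) For orientation, below is a minimal sketch of the user-facing flow those internals serve. The model and tensor shapes are illustrative only, and the script assumes it is launched through paddle.distributed.launch so that init_parallel_env() can set up the communication context; how the eager-mode process group is constructed is outside this snippet.

import paddle
import paddle.distributed as dist


class SimpleNet(paddle.nn.Layer):  # illustrative model, not part of this diff
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc = paddle.nn.Linear(10, 1)

    def forward(self, x):
        return self.fc(x)


# 1. initialize the parallel environment (communication context, default group)
dist.init_parallel_env()

# 2. wrap the model; during backward(), gradients are grouped into buckets and
#    all-reduced by the Reducer (EagerReducer when eager mode is enabled)
model = paddle.DataParallel(SimpleNet())
opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

x = paddle.randn([4, 10], dtype='float32')
loss = model(x).mean()
loss.backward()   # triggers the bucketed gradient all-reduce
opt.step()
opt.clear_grad()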
@@ -53,7 +54,7 @@ class TestJitCustomAttrs(unittest.TestCase): self.int64_vec_attr = [10000000000, 10000000000, 10000000000] self.str_vec_attr = ["StrAttr", "StrAttr", "StrAttr"] - def test_attr_value(self): + def func_attr_value(self): x = paddle.ones([2, 2], dtype='float32') x.stop_gradient = False out = custom_attrs.attr_test( @@ -65,7 +66,12 @@ class TestJitCustomAttrs(unittest.TestCase): self.assertTrue(np.array_equal(x.numpy(), out.numpy())) - def test_const_attr_value(self): + def test_attr_value(self): + with _test_eager_guard(): + self.func_attr_value() + self.func_attr_value() + + def func_const_attr_value(self): x = paddle.ones([2, 2], dtype='float32') x.stop_gradient = False out = custom_attrs.const_attr_test( @@ -77,6 +83,11 @@ class TestJitCustomAttrs(unittest.TestCase): self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + def test_const_attr_value(self): + with _test_eager_guard(): + self.func_const_attr_value() + self.func_const_attr_value() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py index 9049b604c910c80a41afa2c509e2ec3fdb4ffbfc..62e61c5bc7f5f235c03d146bd77fee41948a2a05 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py @@ -21,6 +21,7 @@ import paddle.static as static from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -116,7 +117,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): "custom op {}: {},\n paddle api {}: {}".format(name, out, name, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: for axis in self.axises: out, grad_inputs = concat_dynamic(custom_ops.custom_concat, @@ -128,6 +129,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static(self): for dtype in self.dtypes: for axis in self.axises: @@ -140,7 +146,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): self.check_output(x1_grad, pd_x1_grad, "x1_grad") self.check_output(x2_grad, pd_x2_grad, "x2_grad") - def test_dynamic_with_attr(self): + def func_dynamic_with_attr(self): for dtype in self.dtypes: for axis in self.axises: out, grad_inputs = concat_dynamic( @@ -153,6 +159,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + def test_dynamic_with_attr(self): + with _test_eager_guard(): + self.func_dynamic_with_attr() + self.func_dynamic_with_attr() + def test_static_with_attr(self): for dtype in self.dtypes: for axis in self.axises: diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py index 25c88ee6c6b01daf62553f5f7857bfe06fce25ca..5f3c107a9b22ad2014bd5e2488c0f48a6866fad8 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_conj.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py @@ -21,6 +21,7 @@ import paddle.static as static from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -116,11 +117,16 @@ class TestCustomConjJit(unittest.TestCase): self.check_output(out, pd_out, "out") self.check_output(x_grad, pd_x_grad, "x's grad") - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) self.run_dynamic(dtype, np_input) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py index 0ba70eaa3e06cec619b2a9175db4aa1c8bf75a8b..811eedf1edaf39c961f9fd292054c2cce5154db9 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_linear.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
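The test changes in the hunks above and in the custom-op tests that follow all apply the same refactor: the original test_* body is renamed to func_*, and a new test_* wrapper runs it twice, once inside _test_eager_guard() and once in the legacy dygraph mode. A minimal sketch of that pattern, using a hypothetical test case and check:

import unittest

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard


class TestEagerGuardPattern(unittest.TestCase):  # hypothetical example, not part of this diff
    def func_add(self):
        x = paddle.ones([2, 2], dtype='float32')
        out = x + x
        self.assertTrue(
            np.array_equal(out.numpy(), np.full([2, 2], 2.0, dtype='float32')))

    def test_add(self):
        # run once under the eager guard, then once in legacy dygraph mode
        with _test_eager_guard():
            self.func_add()
        self.func_add()


if __name__ == '__main__':
    unittest.main()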
@@ -94,7 +95,7 @@ class TestCustomLinearJit(unittest.TestCase): self.np_bias) self.check_output(pten_out, pd_out, "pten_out") - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: pten_out = linear_dynamic(custom_ops.pten_linear, dtype, self.np_x, self.np_weight, self.np_bias) @@ -102,6 +103,11 @@ class TestCustomLinearJit(unittest.TestCase): self.np_bias) self.check_output(pten_out, pd_out, "pten_out") + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py index 207ea87974130698b8bf491bce3cd753045a97e9..4da99b1ea10418c6cb6baddb51596b307c6ba28d 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py @@ -68,12 +68,6 @@ class TestCustomRawReluOp(unittest.TestCase): self.assertTrue(custom_raw_relu_op is not None) return custom_raw_relu_op(x) - def test_dygraph(self): - x = paddle.to_tensor(np.random.uniform(low=-1.0, high=1.0, size=[2, 3])) - y1 = self.custom_raw_relu(x) - y2 = paddle.nn.ReLU()(x) - self.assertTrue(np.array_equal(y1.numpy(), y2.numpy())) - def test_static(self): paddle.enable_static() shape = [2, 3] diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py index dddb14eb78e8a1dc6a0820ead2ebfa915b8a09c2..81793f1391d0422393c1f6c1e719f708112d3b6b 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py @@ -22,6 +22,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_MAC +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -98,7 +99,7 @@ class TestDygraphModel(unittest.TestCase): self.x_spec = paddle.static.InputSpec( shape=[None, self.in_dim], dtype='float32', name='x') - def test_train_eval(self): + def func_train_eval(self): for device in self.devices: # set device paddle.set_device(device) @@ -106,26 +107,34 @@ class TestDygraphModel(unittest.TestCase): # for train origin_relu_train_out = self.train_model(use_custom_op=False) custom_relu_train_out = self.train_model(use_custom_op=True) - custom_relu_dy2stat_train_out = self.train_model( - use_custom_op=True, dy2stat=True) # for to_static + # open this when dy2stat is ready for eager + if not _in_eager_mode(): + custom_relu_dy2stat_train_out = self.train_model( + use_custom_op=True, dy2stat=True) # for to_static + self.assertTrue( + np.array_equal(origin_relu_train_out, + custom_relu_dy2stat_train_out)) self.assertTrue( np.array_equal(origin_relu_train_out, custom_relu_train_out)) - self.assertTrue( - np.array_equal(origin_relu_train_out, - custom_relu_dy2stat_train_out)) # for eval origin_relu_eval_out = self.eval_model(use_custom_op=False) custom_relu_eval_out = self.eval_model(use_custom_op=True) - custom_relu_dy2stat_eval_out = self.eval_model( - use_custom_op=True, dy2stat=True) # for to_static + if not _in_eager_mode(): + custom_relu_dy2stat_eval_out = self.eval_model( + use_custom_op=True, dy2stat=True) # for to_static + self.assertTrue( + np.array_equal(origin_relu_eval_out, + custom_relu_dy2stat_eval_out)) self.assertTrue( np.array_equal(origin_relu_eval_out, custom_relu_eval_out)) - self.assertTrue( - np.array_equal(origin_relu_eval_out, - custom_relu_dy2stat_eval_out)) + + def test_train_eval(self): + with _test_eager_guard(): + self.func_train_eval() + self.func_train_eval() def train_model(self, use_custom_op=False, dy2stat=False): # reset random seed diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 407eb342ba99ba09d0f6faa8686e12c2a1100cdb..a747d10823ec5572e73e7ec8ae3e5da528e3e88d 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -20,7 +20,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_WINDOWS, IS_MAC from test_custom_relu_op_setup import custom_relu_dynamic, custom_relu_static - +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format( @@ -75,7 +75,7 @@ class TestJITLoad(unittest.TestCase): "custom op out: {},\n paddle api out: {}".format( out, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -95,8 +95,14 @@ class TestJITLoad(unittest.TestCase): "custom op x grad: {},\n paddle api x grad: {}".format( x_grad, pd_x_grad)) - def test_exception(self): + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + + def func_exception(self): caught_exception = False + # if not _in_eager_mode(): try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'int32', x) @@ -114,11 +120,11 @@ class TestJITLoad(unittest.TestCase): "python/paddle/fluid/tests/custom_op/custom_relu_op.cc" in str(e)) self.assertTrue(caught_exception) - caught_exception = False # MAC-CI don't support GPU if IS_MAC: return + # if not _in_eager_mode(): try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x) @@ -132,6 +138,11 @@ class TestJITLoad(unittest.TestCase): str(e)) self.assertTrue(caught_exception) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + def test_load_multiple_module(self): custom_module = load( name='custom_conj_jit', diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 0af0aa16466ea82eeb4a9558bdbcb3de69489bb4..7c61e11a18ecd2ebbcd87fae37a8ba0a39ad56d1 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -21,6 +21,7 @@ import paddle.static as static import subprocess import numpy as np from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.framework import _test_eager_guard def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): @@ -216,7 +217,7 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): "custom op out: {},\n paddle api out: {}".format( out, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -236,6 +237,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): "custom op x grad: {},\n paddle api x grad: {}".format( x_grad, pd_x_grad)) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static_save_and_load_inference_model(self): paddle.enable_static() np_data = np.random.random((1, 1, 28, 28)).astype("float32") diff --git a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py index c60bac4060b643c01be87d82c1fcde4a8ae4be7e..f68a37b1a2f3b87f4126953ae50464d5ad8e6fe3 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py @@ -20,6 +20,7 @@ import paddle from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists 
in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -39,7 +40,7 @@ custom_ops = load( class TestCustomSimpleSliceJit(unittest.TestCase): - def test_slice_output(self): + def func_slice_output(self): np_x = np.random.random((5, 2)).astype("float32") x = paddle.to_tensor(np_x) custom_op_out = custom_ops.custom_simple_slice(x, 2, 3) @@ -48,6 +49,11 @@ class TestCustomSimpleSliceJit(unittest.TestCase): np.array_equal(custom_op_out, np_out), "custom op: {},\n numpy: {}".format(np_out, custom_op_out.numpy())) + def test_slice_output(self): + with _test_eager_guard(): + self.func_slice_output() + self.func_slice_output() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py index 12e9f50a5e4092a067c533bcdb6bcb03011d35fa..0d2cb941eafaa188c7be458b08f0f5ef35ab6238 100644 --- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py @@ -19,7 +19,7 @@ import numpy as np from paddle.utils.cpp_extension import load, get_build_directory from utils import paddle_includes, extra_cc_args from paddle.utils.cpp_extension.extension_utils import run_cmd - +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. file = '{}\\dispatch_op\\dispatch_op.pyd'.format(get_build_directory()) @@ -39,7 +39,7 @@ class TestJitDispatch(unittest.TestCase): def setUp(self): paddle.set_device('cpu') - def run_dispatch_test(self, func, dtype): + def run_dispatch_test_impl(self, func, dtype): np_x = np.ones([2, 2]).astype(dtype) x = paddle.to_tensor(np_x) out = func(x) @@ -50,6 +50,11 @@ class TestJitDispatch(unittest.TestCase): np.array_equal(np_x, np_out), "custom op x: {},\n custom op out: {}".format(np_x, np_out)) + def run_dispatch_test(self, func, dtype): + with _test_eager_guard(): + self.run_dispatch_test_impl(func, dtype) + self.run_dispatch_test_impl(func, dtype) + def test_dispatch_integer(self): dtypes = ["int32", "int64", "int8", "uint8", "int16"] for dtype in dtypes: diff --git a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py index 97b37498c4d3d0a6b2c336c240aaf116b11e0407..4fc9270b0f44cc5778775bdab4c2b7cab95c8c3a 100644 --- a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py @@ -22,7 +22,7 @@ from paddle.utils.cpp_extension import load from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args - +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
file = '{}\\multi_out_jit\\multi_out_jit.pyd'.format(get_build_directory()) @@ -84,7 +84,7 @@ class TestMultiOutputDtypes(unittest.TestCase): self.check_multi_outputs(res) paddle.disable_static() - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: paddle.set_device(device) @@ -95,6 +95,11 @@ class TestMultiOutputDtypes(unittest.TestCase): self.assertTrue(len(outs) == 3) self.check_multi_outputs(outs, True) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e75b8d1f60bf7dbbfb500a464a3b591a0d1f7ed3..cbe360f556cd986646bc7f45b3a80ab0f5edb9eb 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -557,6 +557,7 @@ if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset) + list(REMOVE_ITEM TEST_OPS test_paddle_multiprocessing) endif() if (NOT WITH_GLOO) @@ -948,6 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE) endif() # setting timeout value as 15S +set_tests_properties(test_run PROPERTIES TIMEOUT 200) set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) @@ -1174,6 +1176,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) test_collective_global_scatter PROPERTIES LABELS "RUN_TYPE=DIST") endif() + set_tests_properties(test_paddle_multiprocessing PROPERTIES TIMEOUT 120) set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120) set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 80bc206ae7b7952aea55cb93bd42346dc019633b..1f7ae53acdf4536921ca25e27874279f271b4de8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -9,4 +9,6 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) + set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py new file mode 100644 index 0000000000000000000000000000000000000000..e34f267b4237bf5ebe19adda1c90f1c147294333 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.distributed.auto_parallel.converter import Converter + + +def test_convert(): + rank_id = paddle.distributed.get_rank() + complete_tensor = np.arange(64).reshape([8, 8]) + tensor_row = np.split(complete_tensor, 2, axis=0) + tensor_col = np.split(complete_tensor, 2, axis=1) + tensor_name = "tensor_0" + complet_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, -1] + } + } + row_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + col_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, 0] + } + } + + # test merge + tensor_dict = {tensor_name: tensor_row} + converter = Converter(tensor_dict, row_strategy, complet_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], complete_tensor).all() + + # test slice + tensor_dict = {tensor_name: [complete_tensor]} + converter = Converter(tensor_dict, complet_strategy, col_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], tensor_col[rank_id]).all() + + # test merge and slice + tensor_dict = {tensor_name: tensor_col} + converter = Converter(tensor_dict, col_strategy, row_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], tensor_row[rank_id]).all() + + # test merge and slice with prefix match + new_name = "tensor_1" + row_strategy = { + new_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + converter = Converter(tensor_dict, col_strategy, row_strategy) + convert_tensor_dict = converter.convert(strict=False) + assert np.equal(convert_tensor_dict[new_name], tensor_row[rank_id]).all() + + +if __name__ == "__main__": + test_convert() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..fbadbb7d8c1cfe8bb92e0287694d48a0a546f206 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage +from paddle.distributed.auto_parallel.converter import Converter + + +class TestConverter(unittest.TestCase): + def test_converter(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "converter.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "launch", "--gpus", "0,1", launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + # Remove unnecessary files + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) + + def test_input_invalid(self): + with self.assertRaises(ValueError): + Converter({}, [], []) + with self.assertRaises(TypeError): + Converter([0, 1], [], []) + with self.assertRaises(ValueError): + Converter({"tmp_0": [0]}, {}, []) + with self.assertRaises(TypeError): + Converter({"tmp_0": [0]}, [0], []) + + strategy_1 = { + 'tmp_0': { + "process_shape": [1], + "process_group": [0], + "dims_mapping": [-1] + } + } + with self.assertRaises(TypeError): + Converter({"tmp_0": [0]}, strategy_1, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py index a7d51a7e176d475763f7368c509dd926e81d0b0f..d150da761aad3de3ab09f257d3b638cf37c27996 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7104f9ef641c00af91f461495eae0caa3c7cd1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_space as ts + + +class TestTunableSpace(unittest.TestCase): + def test_fixed(self): + space = ts.TunableSpace() + fixed = space.fixed("fixed", default=4) + self.assertEqual(space.values["fixed"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["fixed"].name, "fixed") + + space.values["fixed"] = 2 + self.assertEqual(space.get_value("fixed"), 2) + self.assertEqual(space.values, {"fixed": 2}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["fixed"].name, "fixed") + + def test_boolean(self): + space = ts.TunableSpace() + boolean = space.boolean("boolean") + self.assertEqual(space.values["boolean"], False) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["boolean"].name, "boolean") + + space.values["boolean"] = True + self.assertEqual(space.get_value("boolean"), True) + self.assertEqual(space.values, {"boolean": True}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["boolean"].name, "boolean") + + def test_choice(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(space.values["choice"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + space.values["choice"] = 2 + self.assertEqual(space.get_value("choice"), 2) + self.assertEqual(space.values, {"choice": 2}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + def test_int_range(self): + space = ts.TunableSpace() + int_range = space.int_range("int_range", start=1, stop=4, default=2) + self.assertEqual(space.values["int_range"], 2) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["int_range"].name, "int_range") + + space.values["int_range"] = 3 + self.assertEqual(space.get_value("int_range"), 3) + self.assertEqual(space.values, {"int_range": 3}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["int_range"].name, "int_range") + + def test_float_range(self): + space = ts.TunableSpace() + float_range = space.float_range( + "float_range", start=0.4, stop=4.4, default=2.0) + self.assertEqual(space.values["float_range"], 2.0) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["float_range"].name, "float_range") + + space.values["float_range"] = 3.0 + self.assertEqual(space.get_value("float_range"), 3.0) + self.assertEqual(space.values, {"float_range": 3.0}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["float_range"].name, "float_range") + + def test_varibles(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(space.values["choice"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + int_range = space.int_range("int_range", start=1, stop=4, default=2) + self.assertEqual(space.values["int_range"], 2) + self.assertEqual(len(space.variables), 2) + self.assertEqual(space.variables["int_range"].name, "int_range") + + def test_not_populated_variable(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=2) + self.assertEqual(choice, 2) + + def test_populated_variable(self): + space = ts.TunableSpace() + space.values["choice"] = 2 + choice = space.choice("choice", [1, 2, 3, 4], default=4) + 
self.assertEqual(choice, 2) + + space["choice"] = 3 + self.assertNotEqual(space.values["choice"], 2) + self.assertEqual(space.values["choice"], 3) + + def test_state(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + int_range = space.int_range("int_range", start=1, stop=4, default=2) + + new_space = space.from_state(space.get_state()) + self.assertEqual(new_space.get_value("choice"), 4) + self.assertEqual(new_space.get_value("int_range"), 2) + self.assertEqual(len(new_space.variables), 2) + self.assertEqual(len(new_space.values), 2) + + self.assertEqual(new_space.variables["choice"].name, "choice") + self.assertEqual(new_space.variables["choice"].default, 4) + self.assertEqual(new_space.variables["choice"].values, [1, 2, 3, 4]) + + self.assertEqual(new_space.variables["int_range"].name, "int_range") + self.assertEqual(new_space.variables["int_range"].default, 2) + self.assertEqual(new_space.variables["int_range"].start, 1) + self.assertEqual(new_space.variables["int_range"].stop, 4) + self.assertEqual(new_space.variables["int_range"].step, 1) + self.assertEqual(new_space.variables["int_range"].endpoint, False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py new file mode 100644 index 0000000000000000000000000000000000000000..c36fca7a9d09a6fb15664226ac0be441fbf49c3e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
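Based only on the API exercised by the TunableSpace tests above (the tuner module is new in this PR, so treat this as an illustrative sketch rather than canonical usage, and the knob names are made up): a space declares typed hyper-parameters, exposes their current values, and round-trips through get_state()/from_state().

from paddle.distributed.auto_parallel.tuner import tunable_space as ts

space = ts.TunableSpace()
space.choice("micro_batch_size", [1, 2, 4, 8], default=4)   # hypothetical knob
space.int_range("num_stages", start=1, stop=4, default=2)   # hypothetical knob

# current values default to each variable's `default` and can be overridden
space.values["micro_batch_size"] = 2
assert space.get_value("micro_batch_size") == 2

# the space can be serialized and restored, as test_state above verifies
restored = space.from_state(space.get_state())
assert restored.get_value("num_stages") == 2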
+ +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_variable as tv + + +class TestTunableVariable(unittest.TestCase): + def test_fixed(self): + fixed = tv.Fixed("fixed", True) + fixed = tv.Fixed.from_state(fixed.get_state()) + self.assertEqual(fixed.default, True) + self.assertEqual(fixed.random(), True) + + fixed = tv.Fixed("fixed", 1) + fixed = tv.Fixed.from_state(fixed.get_state()) + self.assertEqual(fixed.default, 1) + self.assertEqual(fixed.random(), 1) + + def test_boolean(self): + boolean = tv.Boolean("bool") + boolean = tv.Boolean.from_state(boolean.get_state()) + self.assertEqual(boolean.default, False) + self.assertIn(boolean.random(), [True, False]) + self.assertIn(boolean.random(1234), [True, False]) + + boolean = tv.Boolean("bool", True) + boolean = tv.Boolean.from_state(boolean.get_state()) + self.assertEqual(boolean.default, True) + self.assertIn(boolean.random(), [True, False]) + self.assertIn(boolean.random(1234), [True, False]) + + def test_choice(self): + choice = tv.Choice("choice", [1, 2, 3, 4]) + choice = tv.Choice.from_state(choice.get_state()) + self.assertEqual(choice.default, 1) + self.assertIn(choice.random(), [1, 2, 3, 4]) + self.assertIn(choice.random(1234), [1, 2, 3, 4]) + + choice = tv.Choice("choice", [1, 2, 3, 4], default=2) + choice = tv.Choice.from_state(choice.get_state()) + self.assertEqual(choice.default, 2) + self.assertIn(choice.random(), [1, 2, 3, 4]) + self.assertIn(choice.random(1234), [1, 2, 3, 4]) + + def test_int_range(self): + int_range = tv.IntRange("int_range", start=1, stop=4, default=2) + int_range = tv.IntRange.from_state(int_range.get_state()) + self.assertEqual(int_range.default, 2) + self.assertIn(int_range.random(), [1, 2, 3, 4]) + self.assertIn(int_range.random(1234), [1, 2, 3, 4]) + self.assertNotEqual(int_range.default, 4) + + int_range = tv.IntRange( + "int_range", start=1, stop=8, step=2, default=3, endpoint=True) + int_range = tv.IntRange.from_state(int_range.get_state()) + self.assertEqual(int_range.default, 3) + self.assertIn(int_range.random(), [1, 3, 5, 7]) + self.assertIn(int_range.random(1234), [1, 3, 5, 7]) + self.assertNotEqual(int_range.default, 2) + + def test_float_range(self): + float_range = tv.FloatRange( + "float_range", start=0.4, stop=4.4, default=2.0) + float_range = tv.FloatRange.from_state(float_range.get_state()) + self.assertEqual(float_range.default, 2.0) + self.assertGreater(float_range.random(), 0.4) + self.assertLess(float_range.random(1234), 4.4) + self.assertNotAlmostEqual(float_range.random(), 1) + self.assertNotAlmostEqual(float_range.random(), 4.4) + + float_range = tv.FloatRange( + "float_range", + start=0.4, + stop=8.4, + step=2.0, + default=3.0, + endpoint=True) + float_range = tv.FloatRange.from_state(float_range.get_state()) + self.assertEqual(float_range.default, 3.0) + self.assertGreater(float_range.random(), 0.4) + self.assertLessEqual(float_range.random(1234), 8.4) + self.assertNotAlmostEqual(float_range.random(), 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 959700ad743b40420200b56055354279386a9a7c..79a2430a161703348824d8e4e687bf85569c408a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -4,5 +4,11 @@ if(WITH_IPU) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + # set all UTs timeout to 200s + 
set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) endforeach(TEST_OP) + + set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300) + set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) + set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600) endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py deleted file mode 100644 index 58a88c113fc0b6b82c1c58d50a1b0824cb530632..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.nn.functional as F -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestRelu(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_training() - self.init_op() - - def init_op(self): - self.op = paddle.fluid.layers.relu - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = self.op(x, **self.attrs) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IpuCompiler( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - - def run_test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) - - def test_case0(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } - self.attrs = {} - self.set_feed_attr() - self.run_test_base() - - -class TestTanh(TestRelu): - def init_op(self): - self.op = F.tanh 
- - -class TestLog(TestRelu): - def init_op(self): - self.op = paddle.fluid.layers.log - - -class TestSigmoid(TestRelu): - def init_op(self): - self.op = F.sigmoid - - -class TestSqrt(TestRelu): - def init_op(self): - self.op = paddle.fluid.layers.sqrt - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 1dab958c1ecbc806df94c651cf4a2d6cd82f3ddb..c640cd441f1b2589bcc3ffa466b865d1fb34c582 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -115,7 +115,7 @@ class TestBase(IPUOpTest): class TestCase1(TestBase): def set_atol(self): - self.atol = 1e-7 + self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 @@ -129,7 +129,7 @@ class TestCase1(TestBase): class TestCase2(TestBase): def set_atol(self): - self.atol = 1e-7 + self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py deleted file mode 100644 index aa6c05dc59a87f844c19912be484a4b007f0adfc..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestBase(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_feed() - self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - np_data = np.random.uniform(low=-1, high=1, size=[1, 3, 100, 100]) - self.feed_ipu = {"x": np_data.astype('float16')} - self.feed_cpu = {"x": np_data.astype('float32')} - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_cpu.values()] - self.feed_list = list(self.feed_cpu.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed_cpu.values() - ] - - def set_attrs(self): - self.attrs = {} - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - add1 = conv1 + conv2 - conv3 = paddle.static.nn.conv2d( - add1, num_filters=8, filter_size=8, bias_attr=False) - out = paddle.fluid.layers.relu(conv3, **self.attrs) - fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - feed = self.feed_ipu if run_ipu else self.feed_cpu - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=False) - ipu_strategy.SetHalfConfig(enable_fp16=True) - program = compiler.IPUCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - feed_list = self.feed_list - program = main_prog - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue(res0.shape == res1.shape) - mae = np.mean(np.abs(res0.flatten() - res1.flatten())) - print("mae is ", mae) - self.assertTrue(mae < 0.001) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..893bd3833430c1059d5251ae6039c274112cbddb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -0,0 +1,328 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle +import paddle.fluid as fluid +from paddle.fluid.core import PassVersionChecker + + +class ElementwiseActivationMkldnnFusePassTest(InferencePassTest): + act_alpha = None + act_beta = None + pass_name = 'elt_act_mkldnn_fuse_pass' + + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data_A = fluid.data( + name="data_A", shape=[-1, 3, 100, 100], dtype="float32") + data_B = fluid.data( + name="data_B", shape=[-1, 3, 100, 100], dtype="float32") + elt_out = self.operand(data_A, data_B) + if self.act is not None: + if self.act_beta is not None: + elt_out = self.act(elt_out, self.act_alpha, self.act_beta) + elif self.act_alpha is not None: + elt_out = self.act(elt_out, self.act_alpha) + else: + elt_out = self.act(elt_out) + + self.feeds = { + "data_A": np.random.random((1, 3, 100, 100)).astype("float32"), + "data_B": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [elt_out] + self.enable_mkldnn = True + + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = None + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class ElementwiseActivationMkldnnFusePassTest_Add_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act_alpha = 4 + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Add_SQRT( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.sqrt + + +class ElementwiseActivationMkldnnFusePassTest_Add_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Add_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = 
fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Add_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Add_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.sigmoid + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Sub_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.sigmoid + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = 
fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.sqrt + + +class ElementwiseActivationMkldnnFusePassTest_Mul_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.sigmoid + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py new file mode 100644 index 0000000000000000000000000000000000000000..0f5279b0edadd61467c3bebe55dfb83aea909267 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
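Each subclass in test_mkldnn_elt_act_fuse_pass.py above pins one (elementwise op, activation) pair, and the pass is correct only if the fused oneDNN kernel reproduces the unfused result for that pair. A minimal NumPy sketch of the reference computation being preserved, shown for the add+relu and add+clip combinations (the helper name is illustrative, not a Paddle API):

import numpy as np

def eltwise_then_act(a, b, eltwise, act):
    # Reference: elementwise op followed by activation, i.e. the numerics
    # that elt_act_mkldnn_fuse_pass must keep intact after fusion.
    return act(eltwise(a, b))

a = np.random.random((1, 3, 100, 100)).astype("float32")
b = np.random.random((1, 3, 100, 100)).astype("float32")

ref_add_relu = eltwise_then_act(a, b, np.add, lambda x: np.maximum(x, 0.0))
# Matches the clip cases above, which use act_alpha=0.0 and act_beta=10.0.
ref_add_clip = eltwise_then_act(a, b, np.add, lambda x: np.clip(x, 0.0, 10.0))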
+ +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestElementWiseAddReluFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + return np.random.random( + [batch_size, 3, 100, 100]).astype(np.float32) + + ops_config = [{ + "op_type": "elementwise_add", + "op_inputs": { + "X": ["A"], + "Y": ["B"] + }, + "op_outputs": { + "Out": ["add_output"] + }, + "op_attrs": {} + }, { + "op_type": "relu", + "op_inputs": { + "X": ["add_output"] + }, + "op_outputs": { + "Out": ["relu_output"] + }, + "op_attrs": {} + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "A": TensorConfig(data_gen=partial(generate_input)), + "B": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["relu_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["elementwise_add"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["elt_act_mkldnn_fuse_pass"], min_success_num=4) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..57fa56acd687582fa67c1592a7d5c505ca6cce06 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
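The MLU file that follows tests check_finite_and_unscale, whose expected behaviour can be read directly from its self.outputs dictionaries: every input is divided by Scale, and FoundInfinite is raised when any input contains NaN or Inf, in which case Out is not checked. A NumPy-only sketch of that reference semantics (the function name is illustrative):

import numpy as np

def ref_check_finite_and_unscale(xs, scale):
    # Unscale every input by `scale`; raise the flag if any value is non-finite.
    found_infinite = any(not np.all(np.isfinite(x)) for x in xs)
    outs = [x / scale for x in xs]
    return outs, np.array([int(found_infinite)])

xs = [np.random.random((129, 129)).astype(np.float32)]
scale = np.random.random((1,)).astype(np.float32)
outs, found = ref_check_finite_and_unscale(xs, scale)
assert found[0] == 0  # random inputs in [0, 1) are always finite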
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +class TestCheckFiniteAndUnscaleOp(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "check_finite_and_unscale" + self.init_dtype() + self.init_test_case() + + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x / scale)], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestCheckFiniteAndUnscaleOpWithNan(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + x[128][128] = np.nan + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def test_check_output(self): + # When input contains nan, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpWithInf(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + x[128][128] = np.inf + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpMultiInput(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x1 = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + +class TestCheckFiniteAndUnscaleOpMultiInputWithNan(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x0[128][128] = np.nan + x1 = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. 
+ self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpMultiInputWithInf(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x0[128][128] = np.nan + x1 = np.random.random((129, 129)).astype(self.dtype) + x1[128][128] = np.inf + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..f3699da15b5356d2bf25341261be0c237e037ce5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
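The MLU test below asserts that merged_momentum applied to a list of parameters matches running the plain momentum op once per parameter. A minimal NumPy sketch of the vanilla update being compared (non-Nesterov, no regularization, rescale_grad=1.0 as used in run_op; helper names are illustrative):

import numpy as np

def momentum_step(param, grad, velocity, lr, mu=0.9):
    # One vanilla momentum update: v <- mu * v + g, p <- p - lr * v.
    velocity = mu * velocity + grad
    param = param - lr * velocity
    return param, velocity

def merged_momentum_step(params, grads, velocities, lr, mu=0.9):
    # The merged op is expected to be numerically identical to looping the
    # single-tensor update over the whole parameter list.
    return [momentum_step(p, g, v, lr, mu)
            for p, g, v in zip(params, grads, velocities)]

shapes = [(3, 4), (2, 7)]
params = [np.random.random(s).astype(np.float32) for s in shapes]
grads = [np.random.random(s).astype(np.float32) for s in shapes]
vels = [np.zeros(s, dtype=np.float32) for s in shapes]

merged = merged_momentum_step(params, grads, vels, lr=0.1)
single = [momentum_step(p, g, v, lr=0.1) for p, g, v in zip(params, grads, vels)]
for (p1, v1), (p2, v2) in zip(merged, single):
    assert np.allclose(p1, p2) and np.allclose(v1, v2)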
+ +import sys +sys.path.append('..') +import unittest +import paddle +import numpy as np +from paddle.fluid.layer_helper import LayerHelper +from collections import OrderedDict + + +def run_momentum_op(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + } + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + +def run_momentum_op2(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False, + use_nesterov=True): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main 
= paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': 'l2_decay', + 'regularization_coeff': 2.0, + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': + ['l2_decay' for i in range(len(param_vars))], + 'regularization_coeff': [2.0 for i in range(len(param_vars))], + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + +class TestMergedMomentum(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float32 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = 
self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_merged): + # MLU Momentum Op does not support rescale_grad + rescale_grad = 1.0 + return run_momentum_op( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged) + + outs1 = run_op(True) + outs2 = run_op(False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + def test_main(self): + self.check_with_place(self.place, multi_precision=False) + + +class TestMergedMomentum2(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float32 # np.float16 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_nesterov, use_merged): + # MLU Momentum Op does not support rescale_grad + rescale_grad = 1.0 + return run_momentum_op2( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged, + use_nesterov=use_nesterov) + + outs1 = run_op(use_nesterov=True, use_merged=True) + outs2 = run_op(use_nesterov=True, use_merged=False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + outs3 = run_op(use_nesterov=False, use_merged=True) + outs4 = run_op(use_nesterov=False, use_merged=False) + self.assertEqual(len(outs3), len(outs4)) + for j, (out3, out4) in enumerate(zip(outs3, outs4)): + self.assertTrue(np.allclose(out3, out4, atol=1e-7)) + + def test_main(self): + self.check_with_place(self.place, multi_precision=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..8ff68a1ce0d69307547db2fd1f83526094c9bfcf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import os +import numpy as np +import random + +import paddle +import paddle.nn as nn +from paddle.fluid.dygraph.nn import Linear +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +import paddle.distributed as dist +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.optimizer import SGD +from paddle.fluid.initializer import NumpyArrayInitializer + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group + + +class LinearModel(nn.Layer): + def __init__(self, attr_list): + super(LinearModel, self).__init__() + self._linear1 = paddle.nn.Linear( + 50, 30, weight_attr=attr_list[0], bias_attr=False) + self._linear2 = paddle.nn.Linear( + 30, 10, weight_attr=attr_list[1], bias_attr=False) + self._linear3 = paddle.nn.Linear( + 10, 10, weight_attr=attr_list[2], bias_attr=False) + + def forward(self, x): + output = self._linear1(x) + output = self._linear2(output) + output = self._linear3(output) + return output + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + process_group = init_process_group() + self.generate_reducer("float32", process_group) + self.generate_reducer("float16", process_group) + + def generate_reducer(self, dtype, process_group): + dev_id = ParallelEnv().dev_id + np.random.seed(2022 + dev_id) + paddle.set_default_dtype(dtype) + + w_1 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(50, 30).astype(dtype))) + w_2 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(30, 10).astype(dtype))) + w_3 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(10, 10).astype(dtype))) + + attr_list = [w_1, w_2, w_3] + inp = np.random.rand(10, 50).astype(dtype) + + # original reducer + params_a = self.model_train(attr_list, inp) + + # refactored reducer in eager mode + with _test_eager_guard(): + params_b = self.model_train( + attr_list, inp, process_group=process_group) + + for i in range(len(params_a)): + np.testing.assert_allclose(params_a[i].numpy(), params_b[i].numpy()) + + def model_train(self, attr_list, inp, process_group=None): + model = LinearModel(attr_list) + model = paddle.DataParallel(model, process_group=process_group) + optimizer = SGD(learning_rate=0.0003, parameters=model.parameters()) + + x = paddle.to_tensor(inp) + x.stop_gradient = False + + for step in range(10): + y = model(x) + loss = y.mean() + + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return model.parameters() + + +class TestCatchErrors1(unittest.TestCase): + def test_multiple_gpus(self): + linear = paddle.nn.Linear(2, 4) + with _test_eager_guard(): + self.assertRaises(RuntimeError, 
paddle.DataParallel, linear) + + +class TestCatchErrors2(unittest.TestCase): + def test_multiple_gpus(self): + with _test_eager_guard(): + linear = paddle.nn.Linear(2, 4) + self.assertRaises(RuntimeError, paddle.DataParallel, linear) + + +if __name__ == '__main__': + dist.init_parallel_env() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index bc280a01890d4a54f76026ccee31666c5f0ff2a8..83a25b71626e1b84ae0f85eeccee5423205dc978 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -20,6 +20,7 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard class TestTensorBackward(unittest.TestCase): @@ -29,7 +30,7 @@ class TestTensorBackward(unittest.TestCase): if paddle.is_compiled_with_cuda(): self._places.append(paddle.CUDAPlace(0)) - def test_tensor_backward(self): + def func_tensor_backward(self): for dtype in self._dtypes: x = np.random.random([2, 100]).astype(dtype) y = np.random.random([100, 2]).astype(dtype) @@ -48,6 +49,11 @@ class TestTensorBackward(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) + def test_tensor_backward(self): + with _test_eager_guard(): + self.func_tensor_backward() + self.func_tensor_backward() + class TestBackwardAPI(unittest.TestCase): def setUp(self): @@ -56,7 +62,7 @@ class TestBackwardAPI(unittest.TestCase): if paddle.is_compiled_with_cuda(): self._places.append(paddle.CUDAPlace(0)) - def test_backward_api(self): + def func_backward_api(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -78,7 +84,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue( np.allclose(x_grad * 2, x_tensor.grad.numpy())) - def test_backward_single_tensor(self): + def test_backward_api(self): + with _test_eager_guard(): + self.func_backward_api() + self.func_backward_api() + + def func_backward_single_tensor(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -97,7 +108,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) - def test_backward_none_grad_tensor(self): + def test_backward_single_tensor(self): + with _test_eager_guard(): + self.func_backward_single_tensor() + self.func_backward_single_tensor() + + def func_backward_none_grad_tensor(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -115,7 +131,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) - def test_backward_accumulator_with_init_grad(self): + def test_backward_none_grad_tensor(self): + with _test_eager_guard(): + self.func_backward_none_grad_tensor() + self.func_backward_none_grad_tensor() + + def func_backward_accumulator_with_init_grad(self): for dtype in self._dtypes: x = np.random.random([10, ]).astype(dtype) y_grad = np.random.random([10, ]).astype(dtype) @@ -134,11 +155,14 @@ class TestBackwardAPI(unittest.TestCase): y = x**2 z = x**3 - x_grad = 2 * x_tensor * ( - y_grad_tensor + 3 * y_tensor * y_tensor * z_grad_tensor) + x_grad = 2 * x * (y_grad + 3 * y * y * z_grad) - self.assertTrue( - np.allclose(x_grad.numpy(), x_tensor.grad.numpy())) + 
self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) + + def test_backward_accumulator_with_init_grad(self): + with _test_eager_guard(): + self.func_backward_accumulator_with_init_grad() + self.func_backward_accumulator_with_init_grad() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py index 0371fa054282bb009889c90c9de4da58894fad8f..9f727608f816c4e818f50f12d4d5cc1fccf04bdb 100644 --- a/python/paddle/fluid/tests/unittests/test_diag_v2.py +++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py @@ -44,6 +44,10 @@ class TestDiagV2Op(OpTest): paddle.enable_static() self.check_output(check_eager=True) + def test_check_grad(self): + paddle.enable_static() + self.check_grad(['X'], 'Out', check_eager=True) + def init_config(self): pass @@ -62,14 +66,14 @@ class TestDiagV2OpCase2(TestDiagV2Op): class TestDiagV2OpCase3(TestDiagV2Op): def init_config(self): - self.x = np.random.randint(-10, 10, size=(10, 10)) + self.x = np.random.randint(-10, 10, size=(10, 10)).astype("float64") self.out = np.diag(self.x, self.offset) class TestDiagV2OpCase4(TestDiagV2Op): def init_config(self): self.x = np.random.rand(100) - self.padding_value = 8 + self.padding_value = 2 n = self.x.size self.out = self.padding_value * np.ones((n, n)) + np.diag( self.x, self.offset) - np.diag(self.padding_value * np.ones(n)) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 156fdcb9b0abe1ea2dcca0e15bbcfec87b8ebf7a..27aec284de4cdebb5ebb9191bfb67d48c1b327f5 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -50,7 +50,7 @@ class EagerScaleTestCase(unittest.TestCase): data_eager.retain_grads() out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertFalse(data_eager.grad._is_initialized()) + self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) self.assertTrue(data_eager.grad._is_initialized()) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) @@ -72,7 +72,7 @@ class EagerScaleTestCase(unittest.TestCase): data_eager.retain_grads() out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertFalse(data_eager.grad._is_initialized()) + self.assertIsNone(data_eager.grad) with self.assertRaisesRegexp( AssertionError, "The type of grad_tensor must be paddle.Tensor"): @@ -632,13 +632,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): tensor2.persistable = True tensor2.stop_gradient = False if core.is_compiled_with_cuda(): - tensor3 = tensor2._copy_to(True, core.CUDAPlace(0)) + tensor3 = tensor2._copy_to(core.CUDAPlace(0), True) self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(tensor3.persistable, True) self.assertTrue(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_gpu_place()) else: - tensor3 = tensor2._copy_to(True, core.CPUPlace()) + tensor3 = tensor2._copy_to(core.CPUPlace(), True) self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(tensor3.persistable, True) self.assertTrue(tensor3.stop_gradient, True) diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py index 66de1b309797fb53316a46b436e5cccf11410216..fac258192112dbff5353c581ad8e276cc5e375c0 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py @@ -22,7 +22,8 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework -from paddle.fluid.framework import Program, program_guard +from paddle.framework import _in_eager_mode +from paddle.fluid.framework import Program, program_guard, _test_eager_guard class TestOneHotOp(OpTest): @@ -45,7 +46,7 @@ class TestOneHotOp(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_attr(OpTest): @@ -68,7 +69,7 @@ class TestOneHotOp_attr(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_default_dtype(OpTest): @@ -91,7 +92,7 @@ class TestOneHotOp_default_dtype(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_default_dtype_attr(OpTest): @@ -114,7 +115,7 @@ class TestOneHotOp_default_dtype_attr(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_out_of_range(OpTest): @@ -132,7 +133,7 @@ class TestOneHotOp_out_of_range(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_exception(unittest.TestCase): @@ -190,6 +191,12 @@ class TestOneHotOpApi(unittest.TestCase): one_hot_label = fluid.one_hot( input=fluid.dygraph.to_variable(label), depth=depth) + one_hot_label = paddle.nn.functional.one_hot( + fluid.dygraph.to_variable(label), depth) + with _test_eager_guard(): + one_hot_label = paddle.nn.functional.one_hot( + paddle.to_tensor(label), depth) + def _run(self, depth): label = fluid.layers.data(name="label", shape=[1], dtype="int64") one_hot_label = fluid.one_hot(input=label, depth=depth) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..1e31356a6bc81c1684a3620d36b66ed441add40b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py @@ -0,0 +1,199 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
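The one_hot_v2 changes above run paddle.nn.functional.one_hot both in regular dygraph and under _test_eager_guard. A small sketch of the expected output against a NumPy identity-matrix lookup (tensor contents are illustrative):

import numpy as np
import paddle

paddle.disable_static()  # make sure we are in dygraph mode

label = np.array([1, 1, 3, 0], dtype="int64")
depth = 4

one_hot = paddle.nn.functional.one_hot(paddle.to_tensor(label), depth)

# NumPy reference: row i of the identity matrix encodes class i.
ref = np.eye(depth, dtype="float32")[label]
assert np.array_equal(one_hot.numpy(), ref)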
+ +import os +import gc +import sys +import unittest +import time +import paddle +import paddle.incubate.multiprocessing as mp + +REPEAT = 20 +HAS_SHM_FILES = os.path.isdir('/dev/shm') + + +def fill_tensor(queue, event): + data = queue.get() + with paddle.no_grad(): + data[0][:] = 5 + data[1][:] = 5 + + event.set() + + +def send_tensor(queue, event, device, dtype): + tensor = paddle.ones([5, 5], dtype=dtype) + queue.put(tensor) + queue.put(tensor) + event.wait() + + +def send_parambase(queue, event, device, dtype): + tensor = paddle.nn.Layer().create_parameter( + [5, 5], + dtype=dtype, + default_initializer=paddle.nn.initializer.Constant(value=1.0)) + queue.put(tensor) + queue.put(tensor) + event.wait() + + +class leak_checker(object): + def __init__(self, test_case): + self.checked_pids = [os.getpid()] + self.test_case = test_case + + def __enter__(self): + self.next_fds = self._get_next_fds(10) + return self + + def __exit__(self, *args): + if args[0] is None: + self.test_case.assertFalse(self.has_shm_files()) + return False + + def check_pid(self, pid): + self.checked_pids.append(pid) + + def _get_next_fds(self, n=1): + fds = [os.dup(0) for i in range(n)] + for fd in fds: + os.close(fd) + return fds + + def has_shm_files(self, wait=True): + if not HAS_SHM_FILES: + return False + result = self._has_shm_files() + if result and wait: + time.sleep(0.5) + return self._has_shm_files() + return result + + def _has_shm_files(self): + gc.collect() + names = ['paddle_' + str(pid) for pid in self.checked_pids] + for filename in os.listdir('/dev/shm'): + for name in names: + if filename.startswith(name): + print("have", filename) + return True + return False + + +class TestMultiprocessingBase(unittest.TestCase): + def get_tensor(self, device="cpu"): + self.device = device.lower() + place = None + tensor = paddle.zeros([5, 5], dtype="float32") + return tensor + + def get_parameter(self): + w = paddle.nn.Layer().create_parameter( + [10, 10], + default_initializer=paddle.nn.initializer.Constant(value=0.0)) + return w + + def _test_empty(self, dtype="float32"): + q = mp.Queue() + empty = paddle.to_tensor([], dtype=dtype) + q.put(empty) + out = q.get(timeout=1) + self.assertEqual(str(out), str(empty)) + + def _test_sharing(self, + ctx=mp, + device='cpu', + dtype="float32", + repeat=1, + param=False): + def test_fill(): + if param: + x = self.get_parameter() + y = (x[:, 1]).detach() + else: + x = self.get_tensor() + y = x[:, 1] + + data = [x, y] + + queue = ctx.Queue() + event = ctx.Event() + queue.put(data) + + process = ctx.Process(target=fill_tensor, args=(queue, event)) + process.daemon = True + lc.check_pid(process.pid) + process.start() + + event.wait(30) + + self.assertTrue(event.is_set()) + self.assertTrue(data[0].equal(5).all()) + self.assertTrue(data[1].equal(5).all()) + + process.join(1 if device != "gpu" else 10) + self.assertFalse(process.is_alive()) + + def test_receive(): + queue = ctx.Queue() + event = ctx.Event() + + process = ctx.Process( + target=send_parambase if param else send_tensor, + args=(queue, event, device, dtype)) + process.daemon = True + lc.check_pid(process.pid) + process.start() + + t1 = queue.get() + t2 = queue.get() + self.assertTrue(t1.equal(1).all()) + del t1, t2 + + event.set() + process.join(1 if device != "gpu" else 10) + self.assertFalse(process.is_alive()) + + with leak_checker(self) as lc: + for _ in range(repeat): + test_fill() + test_receive() + + +class TestMultiprocessingCpu(TestMultiprocessingBase): + def test_pass_tensor(self): + paddle.set_device("cpu") 
+ self._test_sharing(repeat=REPEAT) + + def test_pass_parambase(self): + paddle.set_device("cpu") + self._test_sharing(repeat=1, param=True) + + def test_pass_empty(self): + paddle.set_device("cpu") + self._test_empty() + + +class TestMultiprocessingGpu(TestMultiprocessingBase): + @unittest.skipIf(not paddle.fluid.core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + def test_pass_tensor(self): + paddle.set_device("gpu") + self._test_sharing(mp.get_context("spawn"), "gpu") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index edf9aed04f5e0a1df3ceb1ec0add27251e2264a5..802fcc96288f6114e81f080ef17e527b2e7ad2bd 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -200,5 +200,10 @@ class TestDataParallelWithPyLayer(TestMultipleGpus): self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py') +class TestDataParallelInEagerMode(TestMultipleGpus): + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe5fb9bb9455aa58d84bda03f9e9e16038a3be0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
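The multiprocessing test above relies on paddle.incubate.multiprocessing queues sharing tensor storage instead of copying it: the parent puts a tensor on the queue, the child rewrites it in place, and the parent observes the new values. A sketch of that pattern on CPU under Linux, mirroring fill_tensor/test_fill (names and sizes are illustrative):

import paddle
import paddle.incubate.multiprocessing as mp

def child(queue, event):
    t = queue.get()
    with paddle.no_grad():
        t[:] = 5          # in-place write, visible to the parent
    event.set()

if __name__ == "__main__":
    paddle.set_device("cpu")
    queue, event = mp.Queue(), mp.Event()
    shared = paddle.zeros([5, 5], dtype="float32")
    queue.put(shared)
    p = mp.Process(target=child, args=(queue, event))
    p.daemon = True
    p.start()
    event.wait(30)
    assert bool(shared.equal(5).all())
    p.join(1)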
+ +import unittest +import subprocess +import sys, os +import json +import shutil + +import random + +from os import listdir +from os.path import isfile, join + +pyname = 'train.py' +colpyfile = '''# train.py for unitest +import os +env = os.environ.copy() +assert "PADDLE_MASTER" in env +assert "PADDLE_GLOBAL_SIZE" in env +assert "PADDLE_LOCAL_SIZE" in env +assert "PADDLE_GLOBAL_RANK" in env +assert "PADDLE_LOCAL_RANK" in env +''' + +pspyfile = '''# train.py for unitest +import os +env = os.environ.copy() +assert "PADDLE_PSERVERS_IP_PORT_LIST" in env +assert "PADDLE_TRAINER_ENDPOINTS" in env +#assert "PADDLE_PSERVER_ENDPOINTS" in env +#assert "PADDLE_TRAINER_ENDPOINTS" in env +#assert "PADDLE_ROLE" in env +#assert "PADDLE_RANK" in env +''' + + +def write_file(name, ct): + with open(name, "w") as f: + f.write(ct) + + +def get_files(pth, prefix): + return [ + f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix) + ] + + +class Collective_Test(unittest.TestCase): + def setUp(self): + write_file(pyname, colpyfile) + + def pdrun(self, args, env=None): + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + if args: + cmd.extend(args.split(" ")) + cmd.extend([pyname]) + proc = subprocess.Popen(cmd, env) + return proc + + ''' + def test_collective_1(self): + args = "--id test1" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + ''' + + def test_collective_2(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id test2 --devices 0,1,2" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + c = get_files('log', 'test2') + self.assertTrue(len(c) == 4) + + def test_collective_3(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + port = random.randrange(6000, 8000) + args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( + port) + p1 = self.pdrun(args) + p2 = self.pdrun(args) + p1.wait() + p2.wait() + self.assertTrue(p1.poll() == 0) + self.assertTrue(p2.poll() == 0) + + c = get_files('log', 'test3') + self.assertTrue(len(c) == 6) + + +class PS_Test(unittest.TestCase): + def setUp(self): + write_file(pyname, pspyfile) + + def pdrun(self, args, env=None): + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + if args: + cmd.extend(args.split(" ")) + cmd.extend([pyname]) + proc = subprocess.Popen(cmd, env) + return proc + + ''' + def test_ps_1(self): + args = "--mode ps" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + def test_ps_2(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id ps2 --server_num=2 --trainer_num=2" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + c = get_files('log', 'ps2') + self.assertTrue(len(c) == 5) + ''' + + def test_ps_3(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + port = random.randrange(6000, 8000) + args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( + port) + p1 = self.pdrun(args) + p2 = self.pdrun(args) + p1.wait() + p2.wait() + self.assertTrue(p1.poll() == 0) + self.assertTrue(p2.poll() == 0) + + c = get_files('log', 'ps3') + self.assertTrue(len(c) == 6) + + def test_ps_4(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903" + p1 = self.pdrun(args) + p1.wait() + self.assertTrue(p1.poll() == 0) + + c = get_files('log', 'ps4') + self.assertTrue(len(c) == 5) + + +if __name__ == '__main__': + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index d50c0fecdeebc79a98f66037080d1a03d73f3924..69bca8dd9ef15459021f44fd1b4887e636516ec6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -474,5 +474,410 @@ def ref_softplus(x, beta=1, threshold=20): return out +# XPU_KP unittests, these ops can be found from xpu_op_kpfirst_list.h +class XPUTestBReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'brelu' + self.use_dynamic_create_class = False + + class XPUTestBRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "brelu" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-5, 10, [10, 12]).astype(self.dtype) + t_min = 1.0 + t_max = 4.0 + # The same with TestAbs + x[np.abs(x - t_min) < 0.005] = t_min + 0.02 + x[np.abs(x - t_max) < 0.005] = t_max + 0.02 + t = np.copy(x) + t[t < t_min] = t_min + t[t > t_max] = t_max + + self.inputs = {'X': x} + self.outputs = {'Out': t} + self.attrs = {'use_xpu': True, 't_min': t_min, 't_max': t_max} + + +support_types = get_xpu_op_support_types('brelu') +for stype in support_types: + create_test_class(globals(), XPUTestBReluOP, stype) + + +class XPUTestCeilOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'ceil' + self.use_dynamic_create_class = False + + class XPUTestCeil(TestActivationOPBase): + def set_case(self): + self.op_type = "ceil" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = np.ceil(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('ceil') +for stype in support_types: + create_test_class(globals(), XPUTestCeilOP, stype) + + +class XPUTestCeluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'celu' + self.use_dynamic_create_class = False + + class XPUTestCelu(TestActivationOPBase): + def set_case(self): + self.op_type = "celu" + self.dtype = self.in_type + + alpha = 1.5 + x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) + out = ref_celu(x, alpha) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'alpha': alpha} + + +support_types = get_xpu_op_support_types('celu') +for stype in support_types: + create_test_class(globals(), XPUTestCeluOP, stype) + + +def ref_celu(x, alpha): + out_ref = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x / alpha) - 1)) + return out_ref.astype(x.dtype) + + +class XPUTestEluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'elu' + self.use_dynamic_create_class = False + + class XPUTestElu(TestActivationOPBase): + def set_case(self): + self.op_type = "elu" + self.dtype = self.in_type + + alpha = 1. 
+ x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) + out = ref_elu(x, alpha) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'alpha': alpha} + + +support_types = get_xpu_op_support_types('elu') +for stype in support_types: + create_test_class(globals(), XPUTestEluOP, stype) + + +def ref_elu(x, alpha): + out_ref = np.where(x > 0, x, alpha * (np.exp(x) - 1)) + return out_ref.astype(x.dtype) + + +class XPUTestFloorOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'floor' + self.use_dynamic_create_class = False + + class XPUTestFloor(TestActivationOPBase): + def set_case(self): + self.op_type = "floor" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = np.floor(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('floor') +for stype in support_types: + create_test_class(globals(), XPUTestFloorOP, stype) + + +class XPUTestHardShrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'hard_shrink' + self.use_dynamic_create_class = False + + class XPUTestHardShrink(TestActivationOPBase): + def set_case(self): + self.op_type = "hard_shrink" + self.dtype = self.in_type + + threshold = 0.5 + # self.set_attrs() + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) * 10 + out = ref_hardshrink(x, threshold) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('hard_shrink') +for stype in support_types: + create_test_class(globals(), XPUTestHardShrinkOP, stype) + + +def ref_hardshrink(x, threshold): + out = np.copy(x) + out[(out >= -threshold) & (out <= threshold)] = 0 + return out + + +class XPUTestHardSigmoidOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'hard_sigmoid' + self.use_dynamic_create_class = False + + class XPUTestHardSigmoid(TestActivationOPBase): + def set_case(self): + self.op_type = "hard_sigmoid" + self.dtype = self.in_type + self.slope = 0.166666666666667 + self.offset = 0.5 + + x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype) + lower_threshold = -self.offset / self.slope + upper_threshold = (1. 
- self.offset) / self.slope + + # Same reason as TestAbs + delta = 0.005 + x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02 + x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02 + + out = ref_hardsigmoid(x, self.slope, self.offset) + + self.attrs = { + 'use_xpu': True, + 'slope': self.slope, + 'offset': self.offset + } + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('hard_sigmoid') +for stype in support_types: + create_test_class(globals(), XPUTestHardSigmoidOP, stype) + + +def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): + return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype) + + +class XPUTestLog1pOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'log1p' + self.use_dynamic_create_class = False + + class XPUTestLog1p(TestActivationOPBase): + def set_case(self): + self.op_type = "log1p" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log1p(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('log1p') +for stype in support_types: + create_test_class(globals(), XPUTestLog1pOP, stype) + + +class XPUTestLogsigmoidOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'logsigmoid' + self.use_dynamic_create_class = False + + class XPUTestLogsigmoid(TestActivationOPBase): + def set_case(self): + self.op_type = "logsigmoid" + self.dtype = self.in_type + + np.random.seed(2048) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = np.log(1 / (1 + np.exp(-x))) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('logsigmoid') +for stype in support_types: + create_test_class(globals(), XPUTestLogsigmoidOP, stype) + + +class XPUTestRelu6OP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'relu6' + self.use_dynamic_create_class = False + + class XPUTestRelu6(TestActivationOPBase): + def set_case(self): + self.op_type = "relu6" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('relu6') +for stype in support_types: + create_test_class(globals(), XPUTestRelu6OP, stype) + + +def ref_relu6(x, threshold=6.0): + out = np.copy(x) + out[np.abs(x - threshold) < 0.005] = threshold + 0.02 + out = np.minimum(np.maximum(x, 0), threshold) + return out + + +class XPUTestSiluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'silu' + self.use_dynamic_create_class = False + + class XPUTestSilu(TestActivationOPBase): + def set_case(self): + self.op_type = "silu" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = x / (np.exp(-x) + 1) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('silu') +for stype in support_types: + create_test_class(globals(), XPUTestSiluOP, stype) + + +class XPUTestSoftReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'soft_relu' + self.use_dynamic_create_class = False + + class XPUTestSoftRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "soft_relu" + self.dtype = 
self.in_type + + np.random.seed(4096) + x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype) + threshold = 2.0 + # The same reason with TestAbs + x[np.abs(x - threshold) < 0.005] = threshold + 0.02 + x[np.abs(x + threshold) < 0.005] = -threshold - 0.02 + t = np.copy(x) + t[t < -threshold] = -threshold + t[t > threshold] = threshold + out = np.log((np.exp(t) + 1)) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'threshold': threshold} + + +support_types = get_xpu_op_support_types('soft_relu') +for stype in support_types: + create_test_class(globals(), XPUTestSoftReluOP, stype) + + +class XPUTestSoftSignOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softsign' + self.use_dynamic_create_class = False + + class XPUTestSoftSign(TestActivationOPBase): + def set_case(self): + self.op_type = "softsign" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_softsign(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softsign') +for stype in support_types: + create_test_class(globals(), XPUTestSoftSignOP, stype) + + +def ref_softsign(x): + out = np.divide(x, 1 + np.abs(x)) + return out + + +class XPUTestSwishOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'swish' + self.use_dynamic_create_class = False + + class XPUTestSwish(TestActivationOPBase): + def set_case(self): + self.op_type = "swish" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_swish(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('swish') +for stype in support_types: + create_test_class(globals(), XPUTestSwishOP, stype) + + +def ref_swish(x): + from scipy.special import expit + out = x * expit(x) + return out + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/multiprocessing/__init__.py b/python/paddle/incubate/multiprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27c23be3a89411da702e3071fec4c99186fca4b9 --- /dev/null +++ b/python/paddle/incubate/multiprocessing/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .reductions import init_reductions +import multiprocessing + +__all__ = [] + +from multiprocessing import * # noqa: F403 + +__all__ += multiprocessing.__all__ # type: ignore[attr-defined] + +# Only support linux for now +# Only support file_system sharing strategy. 
+ +init_reductions() diff --git a/python/paddle/incubate/multiprocessing/reductions.py b/python/paddle/incubate/multiprocessing/reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbc55afd3bca87aa279c7aa251aa23671b1a317 --- /dev/null +++ b/python/paddle/incubate/multiprocessing/reductions.py @@ -0,0 +1,189 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +# TODO: check the hooks of tensor +# TODO: check serializing named tensor +# TODO: check influence on autograd +import os +import sys +import warnings +import math +import copy +import threading +import multiprocessing +from multiprocessing.util import register_after_fork +from multiprocessing.reduction import ForkingPickler + +from collections import OrderedDict + + +def _supported_check(): + if sys.platform != "linux": + # warnings.warn("`paddle.multiprocessing` only support linux for now, " + # " import this will not take any effect !") + + return False + + if not sys.version_info >= (3, 4): + warnings.warn("Use `paddle.multiprocessing` to share paddle tensor " + "requires python version greater than 3.4 ." + " `paddle.multiprocessing` will not take any effect !!!") + return False + + return True + + +class LRUSharedCache(OrderedDict): + def __init__(self): + self.limit = 128 + self._after_fork() + register_after_fork(self, LRUSharedCache._after_fork) + + def _after_fork(self): + self.lock = threading.Lock() + + def get(self, key): + with self.lock: + try: + value = super().pop(key) + super().__setitem__(key, value) + return value + except KeyError: + return None + + def __setitem__(self, key, value): + with self.lock: + try: + super().__delitem__(key) + except KeyError: + if len(self) >= self.limit: + super().popitem(last=False) + super().__setitem__(key, value) + + +shared_cache = LRUSharedCache() + + +def cuda_from_cache(key): + lodtensor = shared_cache.get(key) + if lodtensor is None: + return None + return lodtensor + + +def rebuild_tensor(cls, lodtensor, metadata): + if cls == paddle.fluid.framework.ParamBase: + tensor = paddle.fluid.framework.ParamBase(lodtensor.shape(), + lodtensor._dtype(), + **metadata) + tensor.value().get_tensor()._share_data_with(lodtensor) + else: + size, stop_gradient = metadata + tensor = paddle.fluid.core.VarBase() + if lodtensor._is_initialized(): + tensor.value().get_tensor()._share_data_with(lodtensor) + else: + tensor = paddle.to_tensor([], dtype=lodtensor._dtype()) + tensor.stop_gradient = stop_gradient + return tensor + + +def reduce_tensor(tensor): + lodtensor = tensor.value().get_tensor() + + if not tensor.stop_gradient and not tensor.is_leaf: + raise RuntimeError( + "Refusing to serialize non-leaf tensor which not stop_gradient, you can detach it!" 
+ ) + # TODO: add serializing name and hooks check + if tensor.place.is_cpu_place() or tensor.place.is_gpu_place( + ) or tensor.place.is_cuda_pinned_place(): + if type(tensor) == paddle.fluid.framework.ParamBase: + metadata = copy.deepcopy(tensor.__dict__) + else: + metadata = (tensor.size, tensor.stop_gradient) + + return (rebuild_tensor, (type(tensor), lodtensor, metadata)) + else: + raise ValueError( + "Only tensors on CPU/CUDA/CUDAPinned places are supported, not %s for now!" + % tensor.place) + + +def rebuild_lodtensor_filename(cls, ipc_name, size, type_idx, dims, lod): + lodtensor = cls._new_shared_filename((ipc_name, size, type_idx, dims, lod)) + lodtensor._shared_decref() + return lodtensor + + +def rebuild_cuda_tensor(cls, handle, offset_bytes, size, type_idx, dims, lod, + device_idx): + cache_tensor = cuda_from_cache((handle, offset_bytes)) + if cache_tensor is None: + lodtensor = cls._new_shared_cuda( + (handle, offset_bytes, size, type_idx, dims, lod, device_idx)) + # We only cache CUDA shared tensors here. + # The cost of opening a CUDA IPC memory handle is very high. + # Since we cache the received tensor directly, + # the sender may reallocate the tensor space, + # so you should manually maintain the lifecycle of the IPC tensor. + shared_cache[(handle, offset_bytes)] = lodtensor + else: + lodtensor = paddle.fluid.core.LoDTensor() + lodtensor._share_buffer_with(cache_tensor, + (size, type_idx, dims, lod, device_idx)) + + return lodtensor + + +def rebuild_lodtensor_empty(cls): + # TODO: check if the tensor is initialized + # TODO: handle the dtype of an empty tensor + return cls() + + +def reduce_lodtensor(lodtensor): + if lodtensor._place().is_cpu_place() or lodtensor._place( + ).is_cuda_pinned_place(): + for dim in lodtensor.shape(): + if dim == 0: + # Empty tensors have nothing to be mmapped.
+ return (rebuild_lodtensor_empty, (type(lodtensor), )) + + # Default to the shared filename strategy + metadata = lodtensor._share_filename( + ) # ipc_name, size, type_idx, dims, lod + rebuild = rebuild_lodtensor_filename + lodtensor._shared_incref() + # TODO: maintain a reference for the lodtensor + # TODO: support the file_descriptor strategy + elif lodtensor._place().is_gpu_place(): + metadata = lodtensor._share_cuda() + rebuild = rebuild_cuda_tensor + else: + raise RuntimeError("Only passing CPU/GPU LoDTensor is supported for now!") + + return (rebuild, (type(lodtensor), ) + metadata) + + +def init_reductions(): + if not _supported_check(): + return + + ForkingPickler.register(paddle.Tensor, reduce_tensor) + ForkingPickler.register(paddle.fluid.core.VarBase, reduce_tensor) + ForkingPickler.register(paddle.fluid.framework.ParamBase, reduce_tensor) + ForkingPickler.register(paddle.fluid.core.LoDTensor, reduce_lodtensor) diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index de8a7ff6d3c7b6cd87d6301f2cd0bb7af119a74d..4c30ed03735f26b6df77c6a8f5b32391972738e5 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -19,6 +19,7 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle import _C_ops from paddle import in_dynamic_mode +from paddle.framework import _in_eager_mode __all__ = [] @@ -87,6 +88,8 @@ def one_hot(x, num_classes, name=None): """ if in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_one_hot(x, num_classes) return _C_ops.one_hot_v2(x, 'depth', num_classes, 'allow_out_of_range', False) else: diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 47dc02705f80bee3ce614846a82c7e44140247b1..96f35eb9d27ec86baa9a7311a4a85a217a7499b8 100755 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -42,6 +42,7 @@ from ..
import compat as cpt from .lr import LRScheduler import copy from paddle import _C_ops +from paddle.fluid.framework import _in_eager_mode __all__ = [] @@ -1108,7 +1109,13 @@ class Optimizer(object): for p in param_group['params']: if not p.stop_gradient: param_list.append(p) - core.clear_gradients(param_list, set_to_zero) + + if _in_eager_mode(): + for p in param_list: + clear_func = p._zero_grads if set_to_zero else p.clear_gradient + clear_func() + else: + core.clear_gradients(param_list, set_to_zero) @imperative_base.no_grad def minimize(self, diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 6c27d465cb12e3c89391608f5ea9871a5a42ddef..0d012685b738c63f3eadc0c1d8c44592fe875845 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -150,6 +150,15 @@ func : reshape inplace : (x -> out) +- api : relu + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : relu + inplace : (x -> out) + - api : scale args : (Tensor x, Scalar scale, float bias, bool bias_after_scale) output : Tensor @@ -158,6 +167,7 @@ param : [x] kernel : func : scale, scale_sr + inplace : (x -> out) - api : sign args : (Tensor x) @@ -194,6 +204,15 @@ output : Tensor invoke : full_like(x, 0, dtype, place) + +- api : one_hot + args : (Tensor x, Scalar num_classes) + output : Tensor + infer_meta : + func : OneHotInferMeta + kernel : + func : one_hot + - api : digamma args : (Tensor x) output : Tensor diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index fe68548a22a6d90bececdd00ac75d760969cee92..d91b76bb70314a2d516b8a384cf3406b7f9e4d0d 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -696,6 +696,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self code_indent) outputs_args, kernel_output_names, output_create = self.gene_output( self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag) + api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" {code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); @@ -709,7 +710,10 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); -{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} {{ +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} }} {code_indent} return {self.gene_return_code()};""" @@ -719,6 +723,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self outputs_args, kernel_output_names, output_create = self.gene_output( self.outputs['types'], 'SetSelectedRowsKernelOutput', code_indent, inplace_flag) + api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" {code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][1]}", {{kernel_backend, kernel_layout, kernel_data_type}}); @@ -732,7 +737,10 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self {code_indent} using 
kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); -{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} {{ +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} }} {code_indent} return {self.gene_return_code()};""" diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index a404fc01784154900a7c6ac1df501424dcdb307e..98a3606952bbb13d3b20c55427b9747f1a4a5624 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -147,6 +147,9 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/declarations.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" """ diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 7417d6bb030da095daf29db080b524db034cdcc6..5506f71f4b671da282fb933436b3c17d4a47a8fb 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -154,6 +154,8 @@ def source_include(header_file_path): #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/infermeta/backward.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" """ diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 853a98a62b504d94617127bd35212d2412719e1c..b0a5d37a535df7e83c08f18e624402294bf29539 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -146,6 +146,9 @@ def custom_write_stub(resource, pyfile): import types import paddle + cur_dir = os.path.dirname(os.path.abspath(__file__)) + so_path = os.path.join(cur_dir, "{resource}") + def inject_ext_module(module_name, api_names): if module_name in sys.modules: return sys.modules[module_name] @@ -157,9 +160,6 @@ def custom_write_stub(resource, pyfile): return new_module def __bootstrap__(): - cur_dir = os.path.dirname(os.path.abspath(__file__)) - so_path = os.path.join(cur_dir, "{resource}") - assert os.path.exists(so_path) # load custom op shared library with abs path @@ -169,6 +169,7 @@ def custom_write_stub(resource, pyfile): __bootstrap__() {custom_api} + """).lstrip() # Parse registerring op information @@ -900,7 +901,7 @@ def _generate_python_module(module_name, # delete the temp file before exit python process atexit.register(lambda: remove_if_exit(api_file)) - # write into .py file with RWLock + # write into .py file with RWLockc api_content = [_custom_api_content(op_name) for op_name in op_names] with open(api_file, 'w') as f: f.write('\n\n'.join(api_content)) @@ -911,13 +912,15 @@ def _generate_python_module(module_name, def _custom_api_content(op_name): - params_str, ins_str, attrs_str, outs_str = _get_api_inputs_str(op_name) - + params_str, ins_str, attrs_str, outs_str, in_names, attrs_names = _get_api_inputs_str( + op_name) + lower_in_names = [p.split("@")[0].lower() for p in in_names] API_TEMPLATE = textwrap.dedent(""" - from paddle.fluid.core import VarBase - from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer + import paddle.fluid.core as core + from 
paddle.fluid.core import VarBase, CustomOpKernelContext + from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer, _in_eager_mode from paddle.fluid.layer_helper import LayerHelper - + def {op_name}({inputs}): # prepare inputs and outputs ins = {ins} @@ -928,9 +931,20 @@ def _custom_api_content(op_name): # The output variable's dtype use default value 'float32', # and the actual dtype of output variable will be inferred in runtime. if in_dygraph_mode(): - for out_name in out_names: - outs[out_name] = VarBase() - _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) + if _in_eager_mode(): + ctx = CustomOpKernelContext() + for i in {in_names}: + ctx.add_inputs(i) + for j in {attr_names}: + ctx.add_attr(j) + for out_name in out_names: + outs[out_name] = core.eager.Tensor() + ctx.add_outputs(outs[out_name]) + core.eager._run_custom_op(ctx, "{op_name}", True) + else: + for out_name in out_names: + outs[out_name] = VarBase() + _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) else: helper = LayerHelper("{op_name}", **locals()) for out_name in out_names: @@ -949,6 +963,9 @@ def _custom_api_content(op_name): inputs=params_str, ins=ins_str, attrs=attrs_str, + # "[x, y, z]"" + in_names="[" + ",".join(lower_in_names) + "]", + attr_names="[" + ",".join(attrs_names) + "]", out_names=outs_str) return api_content @@ -996,7 +1013,7 @@ def _get_api_inputs_str(op_name): ]) # e.g: ['Out', 'Index'] outs_str = "[%s]" % ','.join(["'{}'".format(name) for name in out_names]) - return params_str, ins_str, attrs_str, outs_str + return params_str, ins_str, attrs_str, outs_str, in_names, attr_names def _write_setup_file(name, diff --git a/python/setup.py.in b/python/setup.py.in index 689f63c0f00e95e3eb861ca1c497685babd01638..44998bd3e1675f2a3f77edd26c9cd8fa85121b6a 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -300,6 +300,7 @@ packages=['paddle', 'paddle.distributed.fleet.meta_parallel.parallel_layers', 'paddle.distributed.auto_parallel', 'paddle.distributed.auto_parallel.operators', + 'paddle.distributed.auto_parallel.tuner', 'paddle.distributed.passes', 'paddle.framework', 'paddle.jit', diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 55d2d59c7ece6a4639b1227f600a7d208a69f2e7..9c802a56a7b6e29bc89ad164a15f2f6d4749734e 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -198,7 +198,9 @@ if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi -HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` +# infrt needs to temporarily use LOG(FATAL) during the debugging period, and will replace it with standard error format in the future. +NO_INFRT_FILES=`git diff --name-only upstream/develop | grep -v "tools/\|paddle/infrt/" || true` +HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $NO_INFRT_FILES |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. 
If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n" check_approval 1 6836917 47554610 22561442 diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py index 78d59c2aef10be6db99c7947e8dc238e5463fb47..0680e87b38b3f6c29e7f813474d947598912437d 100644 --- a/tools/infrt/get_compat_kernel_signature.py +++ b/tools/infrt/get_compat_kernel_signature.py @@ -16,6 +16,8 @@ import os import re import json +skip_list = [] + def parse_compat_registry(kernel_info): name, inputs_str, attrs_str, outputs_str = kernel_info.split(",{") @@ -42,6 +44,8 @@ def get_compat_kernels_info(): compat_files.remove(file_) for file_ in compat_files: + if file_ in skip_list: + continue with open("../../paddle/phi/ops/compat/" + file_) as in_file: txt = in_file.readlines() content = "" @@ -54,8 +58,9 @@ def get_compat_kernels_info(): content += line if (registry and ";" in line): data = content.replace("\n", "").replace( - " ", "").strip("return").strip( - "KernelSignature(").strip("\);").replace("\"", "") + " ", + "").strip("return").strip("KernelSignature(").strip( + "\);").replace("\"", "").replace("\\", "") registry = False name, registry_info = parse_compat_registry(data)
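Editor's note: the sketches below are illustrative only and are not part of the patch. First, the expected output that the new brelu XPU test builds element-wise (copy x, then clamp values below t_min and above t_max) is simply a clip to [t_min, t_max]; a hypothetical helper in the style of the other ref_* functions in that test file would be:

import numpy as np


def ref_brelu(x, t_min=1.0, t_max=4.0):
    # Bounded relu: clamp every element of x into [t_min, t_max].
    return np.clip(x, t_min, t_max).astype(x.dtype)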
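Editor's note: a minimal usage sketch for the new paddle.incubate.multiprocessing package, assuming the constraints stated in its __init__.py (Linux only, file_system sharing strategy). The worker function and queue are invented for illustration; only the module and the reduction behaviour come from the patch.

import paddle
# Re-exports the standard multiprocessing API and calls init_reductions().
import paddle.incubate.multiprocessing as mp


def worker(queue):
    # The child rebuilds a tensor that shares storage with the parent's copy.
    t = queue.get()
    print(t.shape, float(t.sum()))


if __name__ == "__main__":
    tensor = paddle.to_tensor([1.0, 2.0, 3.0])  # CPU tensor, shared via a memory-mapped file
    queue = mp.Queue()  # queues pickle with ForkingPickler, so reduce_tensor() is used
    p = mp.Process(target=worker, args=(queue, ))
    p.start()
    queue.put(tensor)
    p.join()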
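Editor's note: a generic sketch of the reduction protocol that init_reductions() relies on, using a made-up Payload class instead of paddle types. ForkingPickler.register(cls, reduce_fn) routes pickling of cls through reduce_fn, which returns a (rebuild_callable, args) pair; the receiving process reconstructs the object by calling rebuild_callable(*args).

import io
import pickle
from multiprocessing.reduction import ForkingPickler


class Payload:
    def __init__(self, data):
        self.data = data


def rebuild_payload(data):
    # Runs on the receiving side, analogous to rebuild_tensor/rebuild_lodtensor.
    return Payload(data)


def reduce_payload(obj):
    # Runs on the sending side: hand back the rebuild hook plus its metadata.
    return (rebuild_payload, (obj.data, ))


ForkingPickler.register(Payload, reduce_payload)

buf = io.BytesIO()
ForkingPickler(buf).dump(Payload([1, 2, 3]))  # serialized through reduce_payload
restored = pickle.loads(buf.getvalue())  # rebuilt through rebuild_payload
print(restored.data)  # [1, 2, 3]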
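Editor's note: a hypothetical walk-through of the string munging in get_compat_kernels_info(). The input line below is an assumption about what a paddle/phi/ops/compat/*.cc file contains, not a quote from one; it only shows how the chained replace()/strip() calls collapse a KernelSignature statement into the comma/brace string that parse_compat_registry() then splits on ",{".

# Illustrative only; mirrors the munging in get_compat_kernels_info().
data = 'return KernelSignature("scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});'
data = data.replace("\n", "").replace(" ", "").strip("return").strip(
    "KernelSignature(").strip("\);").replace("\"", "").replace("\\", "")
print(data)  # scale,{X},{scale,bias,bias_after_scale},{Out}
name, inputs_str, attrs_str, outputs_str = data.split(",{")
print(name)  # scale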