From aba759ba16422abf8cd39ae7e19d24f5997b9ade Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 21 Sep 2020 17:55:29 +0800 Subject: [PATCH] [Feature] Enhance inplace addto strategy for gradient accumulation in static graph (#27112) * support using add instead of sum to do gradient accumulation * add inplace addto pass * add grad_add op and inplace addto pass * remove debug code * code refine * fix bug when several sum ops are inserted at the same op_idx * fix Flags type * add addto attribute for conv3d * fix ut * code clean * fix type --- paddle/fluid/framework/details/CMakeLists.txt | 1 + .../fluid/framework/details/build_strategy.h | 4 + .../fluid/framework/details/op_handle_base.cc | 7 + .../fluid/framework/details/op_handle_base.h | 6 + .../details/share_tensor_buffer_functor.cc | 12 +- .../details/share_tensor_buffer_functor.h | 10 +- .../details/share_tensor_buffer_op_handle.cc | 9 +- .../details/share_tensor_buffer_op_handle.h | 5 +- .../ir/memory_optimize_pass/CMakeLists.txt | 2 + .../buffer_shared_inplace_op_pass.cc | 6 +- .../inplace_addto_op_pass.cc | 221 ++++++++++++++++++ .../memory_optimize_pass/memory_reuse_pass.cc | 11 +- .../memory_optimize_pass/memory_reuse_pass.h | 14 +- paddle/fluid/framework/operator.h | 8 + paddle/fluid/framework/parallel_executor.cc | 19 ++ paddle/fluid/operators/conv_cudnn_op.cu | 27 ++- paddle/fluid/operators/conv_op.cc | 10 + .../elementwise/elementwise_add_op.cc | 18 ++ .../elementwise/elementwise_add_op.cu | 7 + paddle/fluid/platform/flags.cc | 15 ++ .../pybind/global_value_getter_setter.cc | 3 +- paddle/fluid/pybind/pybind.cc | 6 + python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/backward.py | 94 ++++++-- .../unittests/test_inplace_addto_strategy.py | 114 +++++++++ 25 files changed, 589 insertions(+), 41 deletions(-) create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc create mode 100644 python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a3cc4d1721..8281ec2143 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass + inplace_addto_op_pass set_reader_device_info_utils add_reader_dependency_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 5388df6bc5..01d496d4ea 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -19,6 +19,7 @@ #include #include #include + #include "boost/optional.hpp" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" @@ -119,6 +120,9 @@ struct BuildStrategy { // Turn on inplace by default. bool enable_inplace_{true}; + // Turn off inplace addto by default. + bool enable_addto_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model.
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 35fe5d631f..459bcff5c0 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/op_handle_base.h" + #include #include @@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) { PADDLE_ENFORCE(!use_cuda); #endif + // skip running the current op, used with inplace_addto_op_pass + if (skip_running_) { + VLOG(4) << "skip running: " << Name(); + return; + } + RunImpl(); } diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index c5aa1295aa..097f54d5d5 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/platform/device_context.h" @@ -52,6 +53,10 @@ class OpHandleBase { virtual Priority GetPriority() const { return kNormal; } + virtual bool GetSkipRunning() const { return skip_running_; } + + virtual void SetSkipRunning(bool skip_running) { skip_running_ = skip_running; } + virtual std::string Name() const = 0; void Run(bool use_cuda); @@ -131,6 +136,7 @@ class OpHandleBase { std::map dev_ctxes_; std::vector local_exec_scopes_; + bool skip_running_ = false; #ifdef PADDLE_WITH_CUDA std::unordered_map events_; diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index 19f075018c..5fbaf3cbfe 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -48,12 +48,13 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) { ShareTensorBufferFunctor::ShareTensorBufferFunctor( Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names) + const std::vector &out_var_names, bool share_dims) : scope_(scope), scope_idx_(scope_idx), op_type_(op_type), in_var_infos_(in_var_infos), - out_var_names_(out_var_names) { + out_var_names_(out_var_names), + share_dims_(share_dims) { PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(), platform::errors::PreconditionNotMet( "The number of input variables and output variables " @@ -151,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { } else { out_tensor->ShareBufferWith(in_tensor); + // NOTE(zhiqiu): In the case of inplace addto, if the operator of + // the in_out_vars is skipped during running, we should set the dims of + // the output to be the same as the input.
+ if (share_dims_) { + out_tensor->Resize(in_tensor.dims()); + } + VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " << in_var_info->Name() << " -> " << out_var_names_[i]; } diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h index 774dcd056e..be49d1c432 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/scope.h" @@ -40,11 +41,13 @@ class ShareTensorBufferFunctor { ShareTensorBufferFunctor( Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names); + const std::vector &out_var_names, bool share_dims = false); void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); + void SetShareDims(bool share_dims) { share_dims_ = share_dims; } + void operator()(Scope *exec_scope); std::unordered_map ReusedVars() const; @@ -66,6 +69,11 @@ class ShareTensorBufferFunctor { std::vector out_var_names_; std::vector> in_out_vars_; + + // NOTE(zhiqiu): In the case of inplace addto, if the operator of + // the in_out_vars is skipped during running, we should set the dims of the + // output to be the same as the input. + bool share_dims_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index b805ad3b07..be3f5515a9 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -59,9 +59,10 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle( ShareTensorBufferOpHandle::ShareTensorBufferOpHandle( ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names) + const std::vector &out_var_names, bool share_dims) : OpHandleBase(node), - functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {} + functor_(scope, scope_idx, op_type, in_var_infos, out_var_names, + share_dims) {} std::unordered_map ShareTensorBufferOpHandle::ReusedVars() const { @@ -73,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair( functor_.AddReuseVarPair(in_var_info, out_var_name); } +void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) { + functor_.SetShareDims(share_dims); +} + void ShareTensorBufferOpHandle::InitCUDA() { #ifdef PADDLE_WITH_CUDA int dev_id = diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h index b22f5621fe..a02c346485 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" @@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase { ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_vars_infos, - const std::vector
&out_var_names); + const std::vector &out_var_names, bool share_dims = false); std::unordered_map ReusedVars() const; @@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase { void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); + void SetShareDims(bool share_dims); + const ShareTensorBufferFunctor &Functor() const { return functor_; } protected: diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index 726a2d90fc..a8c0973cac 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass) cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) +cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) + cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index 0b42f2ebd5..ce7f27d275 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" @@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { VLOG(4) << "Inplace performed in op " << op_type << ": " << in_var_handle_ptr->Name() << " -> " << out_var_handle_ptr->Name() - << ". Debug String is: " << op->GetOp()->DebugString(); + << ". Debug String is: " << op->GetOp()->DebugString() + << ". ReuseType: " << ReuseType(); } else { VLOG(3) << "Inplace failed in op " << op_type << ": " << in_var_handle_ptr->Name() << " -> " - << out_var_handle_ptr->Name(); + << out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType(); } } } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc new file mode 100644 index 0000000000..81c63f46bd --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
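+ +// This pass changes gradient accumulation from a single sum op, +// g = sum(g_0, g_1, ..., g_n), into a chain of inplace grad_add ops: each +// grad_add shares its operands' buffers and is skipped at run time, while the +// producing conv2d_grad accumulates directly into the shared buffer via its +// use_addto attribute.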
+ +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InplaceAddToOpPass : public MemoryReusePass { + protected: + std::string ReuseType() const override { return "inplace_addto"; } + + void Run(Graph *graph) const override; + + private: + // 1. Add any last living op of out_var to the last living ops of in_var + // 2. Set the reference count of in_var to 2 + void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, + details::VarHandle *in_var, + details::VarHandle *out_var) const override { + size_t scope_idx = op->GetScopeIdx(); + auto *last_live_ops_of_vars_ = + &Get>(kLastLiveOpsOfVars); + auto *var_infos_ = &(Get(kMemOptVarInfoMapList)); + auto out_var_op_iter = + (*last_live_ops_of_vars_)[scope_idx].find(out_var->Name()); + + // In Reduce mode, some output variables (gradients of parameters) do not + // have last live ops + details::ComputationOpHandle *last_live_op_of_in_var = nullptr; + if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) { + last_live_op_of_in_var = op; + } else { + PADDLE_ENFORCE_EQ( + out_var_op_iter->second.ops().empty(), false, + platform::errors::InvalidArgument( + "Var(%s)'s last live ops should not be empty.", out_var->Name())); + last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin()); + } + + auto *last_live_ops_of_in_var = + (*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops(); + // last_live_ops_of_in_var->clear(); + last_live_ops_of_in_var->insert(last_live_op_of_in_var); + + auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name()); + PADDLE_ENFORCE_NE( + in_var_info_iter, (*var_infos_)[scope_idx].end(), + platform::errors::NotFound("Cannot find variable %s.", in_var->Name())); + + in_var_info_iter->second->SetRefCnt(2); // before inplace, it is 1 + } +}; + +void InplaceAddToOpPass::Run(Graph *graph) const { + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); + + bool use_cuda = Get(kUseCuda); + + // Currently, InplaceAddToOpPass is only performed on CUDA places + if (!use_cuda) { + return; + } + + // Step 1: Build a reverse map of last_live_ops, + // i.e.: op -> vars + std::unordered_map> + candidate_ops; + for (auto &each_scope_ops : last_live_ops) { + for (auto &pair : each_scope_ops) { + // If a variable has more than 1 last lived op, it cannot + // be inplaced. + if (pair.second.ops().size() != 1) { + continue; + } + + auto *op = *(pair.second.ops().begin()); + const std::string &op_type = op->GetOp()->Type(); + const framework::OpDesc *op_desc = op->Node()->Op(); + PADDLE_ENFORCE_NOT_NULL( + op_desc, platform::errors::NotFound("Cannot find the OpDesc of Op(%s).", + op->Name())); + + // Only the grad_add op should be processed.
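+ // grad_add ops are emitted by _accumulate_gradients_by_add_ops_ in + // backward.py when the number of gradients to accumulate is no more than + // FLAGS_max_inplace_grad_add.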
+ if (op_type != "grad_add") { + continue; + } + + const std::string &var_name = pair.first; + auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs); + if (in_nodes.size() == 1) { + candidate_ops[op][var_name] = *in_nodes.begin(); + } + VLOG(4) << "Find op " << op_type << " with input(" << var_name + << ") that can do inplace addto"; + } + } + + // Step 2: Check which vars can indeed be inplaced + for (auto &op_vars_pair : candidate_ops) { + auto *op = op_vars_pair.first; + + // The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it + // could be changed as follows if inplace addto is enabled: + // g_sum_0 = g_0 + // g_sum_1 = grad_add(g_sum_0, g_1) + // g_sum_2 = grad_add(g_sum_1, g_2) + // ... + // g_sum_n = grad_add(g_sum_n-1, g_n) + + // Here we will add inplace for each grad_add; for example, for the first + // grad_add, g_sum_0 -> g_1 and g_sum_1 -> g_1, and the grad_add is marked + // as skipped. + + const std::string &op_type = op->GetOp()->Type(); + + PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2, + platform::errors::InvalidArgument( + "The size of inputs of %s should be 2, but got %d", + op_type, op->Node()->inputs.size())); + + PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1, + platform::errors::InvalidArgument( + "The size of outputs of %s should be 1, but got %d", + op_type, op->Node()->outputs.size())); + + auto *left_var_ptr = dynamic_cast( + &(op->Node()->inputs[0]->Wrapper())); + auto *right_var_ptr = dynamic_cast( + &(op->Node()->inputs[1]->Wrapper())); + auto *out_var_ptr = dynamic_cast( + &(op->Node()->outputs[0]->Wrapper())); + + if (left_var_ptr == nullptr || right_var_ptr == nullptr || + out_var_ptr == nullptr) { + continue; + } + + // auto *left_generated_op = dynamic_cast( + // left_var_ptr->GeneratedOp()); + + auto *right_generated_op = dynamic_cast( + right_var_ptr->GeneratedOp()); + + auto *out_generated_op = dynamic_cast( + out_var_ptr->GeneratedOp()); + + // NOTE(zhiqiu): currently, only conv2d_grad supports the addto strategy + if (right_generated_op->Name() != "conv2d_grad") { + continue; + } + + // NOTE(zhiqiu): Normally, if we inplace a->b, we should let a be generated + // before b. However, in the situation of inplace addto, we do not care + // about the order, since a+b is equal to b+a. Is there any exception for that? + + // AddDependencyVar(right_generated_op, left_generated_op); + // no need, as discussed above. + + // step (a): inplace right_var->left_var of grad_add + + this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr); + UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr); + VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type() + << ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name() + << ". Debug String is: " + << right_generated_op->GetOp()->DebugString() + << ". ReuseType: " << ReuseType(); + + // step (b): inplace out -> right_var of grad_add + + this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true); + + VLOG(4) << "Inplace performed in op " << op_type << ": " + << left_var_ptr->Name() << " -> " << out_var_ptr->Name() + << ". Debug String is: " << op->GetOp()->DebugString() + << ". ReuseType: " << ReuseType(); + + // step (c): make right_var non-inplaceable afterwards. This can be done + // automatically since CollectReusedVars is called before any reuse.
+ + // step (d): make right_var's generated op use addto + right_generated_op->GetOp()->SetAttr("use_addto", true); + + // step (e): make grad_add skip running + op->SetSkipRunning(true); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass) + .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList) + .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::ir::kUseCuda); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index 221b0a76e7..3e3b9864a7 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" + #include #include #include @@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var, out_var->Name())); if (IsVarPairReusable(*in_var, *out_var)) { AddReuseVar(op, in_var, out_var); + UpdateLastLiveOpOfVar(op, in_var, out_var); return true; } else { return false; @@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable( void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, - details::VarHandle *out_var) const { + details::VarHandle *out_var, + bool share_dims) const { PADDLE_ENFORCE_GT( (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, platform::errors::NotFound("Var(%s) does not in mem opt var infos.", @@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, share_buffer_op->AddInput(in_var); } + if (share_dims) { + share_buffer_op->SetShareDims(true); + } + share_buffer_op->AddReuseVarPair( (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(), out_var->Name()); reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name()); reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name()); - - UpdateLastLiveOpOfVar(op, in_var, out_var); } // 1. 
Set last living op of in_var to be any last living op of out_var diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h index 8227441918..1c0c6ae602 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" @@ -92,6 +93,12 @@ class MemoryReusePass : public Pass { int64_t GetMemorySize(const details::VarHandle &var) const; + void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, + details::VarHandle *out_var, bool share_dims = false) const; + virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, + details::VarHandle *in_var, + details::VarHandle *out_var) const; + private: VarDesc *GetVarDesc(const details::VarHandle &var) const; @@ -109,13 +116,6 @@ class MemoryReusePass : public Pass { void CollectReusedVars() const; - void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, - details::VarHandle *out_var) const; - - void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, - details::VarHandle *in_var, - details::VarHandle *out_var) const; - private: mutable Graph *graph_; mutable bool use_cuda_; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ebecbf0498..bd52d7ffef 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -157,6 +157,14 @@ class OperatorBase { platform::errors::NotFound("(%s) is not found in AttributeMap.", name)); return BOOST_GET_CONST(T, attrs_.at(name)); } + void SetAttr(const std::string& name, const Attribute& v) { + PADDLE_ENFORCE_EQ( + HasAttr(name), true, + platform::errors::NotFound( + "The attribute %s is not found in operator %s", name, Type())); + + attrs_[name] = v; + } const AttributeMap& Attrs() const { return attrs_; } const VariableNameMap& Inputs() const { return inputs_; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 12e0f97f12..535ec9cd7d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" @@ -108,6 +110,11 @@ class ParallelExecutorPrivate { * them. 
*/ inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) { + if (mem_opt_var_infos_.size() == 0) { + VLOG(4) << "The mem_opt_var_infos_ is empty; maybe no memory " + "optimization strategy is enabled"; + return; + } auto iter = mem_opt_var_infos_[scope_idx].find(name); if (iter != mem_opt_var_infos_[scope_idx].end()) { iter->second->SetSkipMemoryReuse(true); } @@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } bool need_mem_opt = build_strategy_.enable_inplace_ || + build_strategy_.enable_addto_ || build_strategy_.memory_optimize_.get() || is_gc_enabled; if (!need_mem_opt) return graph; @@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { graph = ref_cnt_pass->Apply(graph); VLOG(10) << "ReferenceCountPass Applied"; + if (build_strategy_.enable_addto_) { + auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass"); + addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); + addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); + addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_); + VLOG(10) << "Start to apply inplace_addto_op_pass"; + graph = addto_pass->Apply(graph); + VLOG(10) << "inplace_addto_op_pass Applied"; + } + if (build_strategy_.enable_inplace_) { auto inplace_pass = ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); @@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); USE_PASS(buffer_shared_inplace_pass); USE_PASS(buffer_shared_cross_op_memory_reuse_pass); +USE_PASS(inplace_addto_op_pass); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 7f70575591..00af724ac7 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" @@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel { #endif // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; + VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { @@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; + VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); + if (input_grad) { - // Because beta is zero, it is unnecessary to reset input_grad. + // When beta is 0, it is unnecessary to reset input_grad. + // When beta is 1, the output cannot be reset since the addto strategy is used. for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { @@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ctx, &transformed_input_grad_channel, input_grad); } } + + // filter_grad does not use inplace addto. + ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { // Because beta is zero, it is unnecessary to reset filter_grad.
@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { input_data + i * group_offset_in, args2.odesc.desc(), output_grad_data + i * group_offset_out, args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size, &beta, args2.wdesc.desc(), + workspace_size, &beta_filter, args2.wdesc.desc(), filter_grad_data + i * group_offset_filter)); }, workspace_size); @@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = W->numel() / groups; - ScalingParamType alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supported in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : + // 0.0f; + // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); + auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); if (ddO) { diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 9ed169fe35..bf97b9d03c 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() { .SetDefault(0.0f); AddAttr("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") .SetDefault(0.0f); + AddAttr( + "use_addto", + "(bool, default false) Whether to use the addto strategy, only used in " + "cudnn kernel") + .SetDefault(false); AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " "whenever convolution output is as an input to residual " @@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() { .SetDefault(0.0f); AddAttr("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") .SetDefault(0.0f); + AddAttr( + "use_addto", + "(bool, default false) Whether to use the addto strategy, only used in " + "cudnn kernel") + .SetDefault(false); AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " "whenever convolution output is as an input to residual " diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 534a19bd94..97624944ca 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" + #include #include + +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace paddle { @@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL( int>, ops::ElementwiseAddDoubleGradKernel); + +// A specialized elementwise_add operator, used in gradient accumulation with +// inplace addto.
+REGISTER_OPERATOR( + grad_add, paddle::operators::ElementwiseOp, + paddle::operators::ElementwiseAddOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + grad_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 7101987280..a4cbd14388 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel); + +REGISTER_OP_CUDA_KERNEL( + grad_add, ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index af8798a4b7..9116edd01b 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -521,3 +521,18 @@ DEFINE_int32( DEFINE_bool(sort_sum_gradient, false, "Sum gradients by the reverse order of " "the forward execution sequence."); + +/** + * Performance related FLAG + * Name: max_inplace_grad_add + * Since Version: 2.0.0 + * Value Range: int32, default=0 + * Example: + * Note: The maximum number of inplace grad_add. + */ +DEFINE_int32( + max_inplace_grad_add, 0, + "The maximum number of inplace grad_add. When doing " + "gradient accumulation, if the number of gradients is no more than " + "FLAGS_max_inplace_grad_add, several grad_add ops will be used " + "instead of one sum op. Default is 0."); diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 318178d5eb..894740e25c 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator); // others DECLARE_bool(benchmark); DECLARE_int32(inner_op_parallelism); +DECLARE_int32(max_inplace_grad_add); DECLARE_string(tracer_profile_fname); #ifdef PADDLE_WITH_CUDA // cudnn @@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, - FLAGS_paddle_num_threads, FLAGS_use_mkldnn); + FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add); #ifdef PADDLE_WITH_CUDA REGISTER_PUBLIC_GLOBAL_VAR( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 330254ecaa..04087cb241 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include @@ -22,6 +23,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle.
"enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) + .def_property( + "enable_addto", + [](const BuildStrategy &self) { return self.enable_addto_; }, + [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) .def_property( "fuse_all_reduce_ops", [](const BuildStrategy &self) { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 9f748b7956..e8cc6ce990 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -197,6 +197,7 @@ def __bootstrap__(): 'free_when_no_cache_hit', 'call_stack_level', 'sort_sum_gradient', + 'max_inplace_grad_add', ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index d51cacd1a5..478fecf74e 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -251,12 +251,19 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): begin_idx = 0 if end_idx is None: end_idx = len(op_descs) - for i in range(begin_idx, end_idx): - op_desc = op_descs[i] - if isinstance(op_desc, tuple): - op_desc = op_desc[0] - op_desc._rename_input(old_name, new_name) - op_desc._rename_output(old_name, new_name) + if isinstance(op_descs, (list, tuple)): + for i in range(begin_idx, end_idx): + op_desc = op_descs[i] + if isinstance(op_desc, tuple): + op_desc = op_desc[0] + op_desc._rename_input(old_name, new_name) + op_desc._rename_output(old_name, new_name) + if isinstance(op_descs, collections.OrderedDict): + for key, value in op_descs.items(): + if isinstance(value, (list, tuple)): + for op_desc in value: + op_desc._rename_input(old_name, new_name) + op_desc._rename_output(old_name, new_name) def _create_op_desc_(op_type, inputs, outputs, attrs): @@ -369,6 +376,41 @@ def _append_grad_suffix_(name): return cpt.to_text(name) + core.grad_var_suffix() +def _accumulate_gradients_by_sum_op_(var_name, renamed_vars, pending_sum_ops, + op_idx): + """ + Use sum op to accumulate_gradients, the gradients are stored in renamed_vars. + """ + if op_idx not in pending_sum_ops.keys(): + pending_sum_ops[op_idx] = [] + pending_sum_ops[op_idx].append( + _create_op_desc_("sum", {"X": renamed_vars[var_name]}, + {"Out": [var_name]}, {"use_mkldnn": False})) + renamed_vars[var_name] = [var_name] + + +def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops, + op_idx): + """ + Use several inplace add op to accumulate_gradients, the gradients are stored in renamed_vars. + """ + if op_idx not in pending_sum_ops.keys(): + pending_sum_ops[op_idx] = [] + out_name = renamed_vars[var_name][0] + for i in range(1, len(renamed_vars[var_name])): + x_name = out_name + y_name = renamed_vars[var_name][i] + if i != len(renamed_vars[var_name]) - 1: + out_name = var_name + '@ADD@' + str(i) + else: + out_name = var_name + pending_sum_ops[op_idx].append( + _create_op_desc_("grad_add", {"X": [x_name], + "Y": [y_name]}, {"Out": [out_name]}, + {"use_mkldnn": False})) + renamed_vars[var_name] = [var_name] + + def _addup_repetitive_outputs_(op_descs, block_idx): """ In backward part, an variable may be the output of more than one ops. @@ -376,7 +418,9 @@ def _addup_repetitive_outputs_(op_descs, block_idx): In these cases, the variable should be the accumulation of all the outputs. `sum_op`s are added to implement the accumulate. 
""" - pending_sum_ops = [] + _MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add'] + #pending_sum_ops = [] + pending_sum_ops = collections.OrderedDict() var_rename_count = collections.defaultdict(int) renamed_vars = collections.defaultdict(list) renamed_var_start_idx = collections.defaultdict(list) @@ -385,10 +429,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx): if "@GRAD" not in var_name: continue if len(renamed_vars[var_name]) > 1: - pending_sum_ops.append((_create_op_desc_( - "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, - {"use_mkldnn": False}), idx)) - renamed_vars[var_name] = [var_name] + if len(renamed_vars[var_name]) > _MAX_ADD_NUM_: + _accumulate_gradients_by_sum_op_(var_name, renamed_vars, + pending_sum_ops, idx) + else: + _accumulate_gradients_by_add_ops_(var_name, renamed_vars, + pending_sum_ops, idx) + for param_idx, param_name in enumerate(op_desc.output_names()): arg_names = op_desc.output(param_name) for arg_idx, var_name in enumerate(arg_names): @@ -440,13 +487,26 @@ def _addup_repetitive_outputs_(op_descs, block_idx): renamed_vars[var_name].append(new_name) for var_name, inputs in six.iteritems(renamed_vars): - if len(inputs) > 1: - pending_sum_ops.append( - (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, - {"use_mkldnn": False}), len(op_descs))) + if len(renamed_vars[var_name]) > 1: + if len(renamed_vars[var_name]) > _MAX_ADD_NUM_: + _accumulate_gradients_by_sum_op_(var_name, renamed_vars, + pending_sum_ops, len(op_descs)) + else: + _accumulate_gradients_by_add_ops_(var_name, renamed_vars, + pending_sum_ops, + len(op_descs)) + # sum_op descs are sorted according to their insert position - for p in reversed(pending_sum_ops): - op_descs.insert(p[1], p[0]) + for key, value in collections.OrderedDict( + reversed(list(pending_sum_ops.items()))).items(): + + # NOTE(zhiqiu): Since reversed, the idx of op_descs to be inserted will remains correct. + # For example, [0, 1, 2], and we want to insert 'a' at idx 1, 'b' at idx 2, and the expected result is [0, 1, 'a', 2, 'b']. + # If reversed, we first insert 'b' at idx 2, it becomes [0, 1, 2, 'b'], and then insert 'a' at idx 1, it becomes [0, 1, 'a', 2, 'b']. + # If not reverse, we first insert 'a' at idx 1, it becomes [0, 1, 'a', 2], and then insert 'b' at idx 2, it becomes [0, 1, 'a', 'b', 2]. + idx = key + for i, op in enumerate(value): + op_descs.insert(idx + i, op) return op_descs diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py new file mode 100644 index 0000000000..c75acd7c15 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py @@ -0,0 +1,114 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.backward import calc_gradient +import numpy as np + + +class ConvBNLayer(fluid.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + use_cudnn=False): + super(ConvBNLayer, self).__init__() + + self._conv = fluid.dygraph.Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False, + use_cudnn=use_cudnn) + + self._batch_norm = fluid.dygraph.BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +def create_program(): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + x = fluid.data(name='img', shape=[-1, 3, 224, 224]) + x.stop_gradient = False + x = fluid.layers.prelu(x, mode="channel") + conv = ConvBNLayer( + num_channels=3, + num_filters=3, + filter_size=1, + act='relu', + use_cudnn=True) + y = conv(x) + x + + loss = fluid.layers.reduce_sum(y) + + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + return loss, main, startup, conv._conv.weight + + +class TestInplaceAddto(unittest.TestCase): + def test_result(self): + def run_program(enable_addto): + np.random.seed(10) + paddle.manual_seed(10) + paddle.framework.random._manual_program_seed(10) + if fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + fluid.set_flags({"FLAGS_max_inplace_grad_add": 2}) + loss, main, startup, w = create_program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + + strategy = fluid.BuildStrategy() + strategy.enable_addto = enable_addto + compiled = fluid.CompiledProgram(main).with_data_parallel( + loss_name=loss.name, build_strategy=strategy) + + exe.run(startup) + img = np.random.uniform(-128, 128, + [8, 3, 224, 224]).astype(np.float32) + for i in range(2): + res = exe.run(compiled, + feed={'img': img}, + fetch_list=[loss.name, w.name]) + return res + + res1, w1 = run_program(True) + res2, w2 = run_program(False) + print(res1, res2) + self.assertTrue(np.array_equal(res1, res2)) + + +if __name__ == "__main__": + unittest.main() -- GitLab
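Usage note: below is a minimal sketch of how the new strategy is enabled, mirroring the unit test above (Fluid 1.x API; `create_program` is the helper defined in the test, and the flag value 8 is illustrative):

    import numpy as np
    import paddle.fluid as fluid

    # Allow accumulation chains of up to 8 gradients via inplace grad_add;
    # longer chains fall back to a single sum op.
    fluid.set_flags({"FLAGS_max_inplace_grad_add": 8})

    loss, main, startup, _ = create_program()  # helper from the test above
    place = fluid.CUDAPlace(0)  # the pass only runs on CUDA places
    exe = fluid.Executor(place)
    exe.run(startup)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_addto = True  # off by default

    compiled = fluid.CompiledProgram(main).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    img = np.random.uniform(-1, 1, [8, 3, 224, 224]).astype(np.float32)
    loss_v, = exe.run(compiled, feed={'img': img}, fetch_list=[loss.name])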