From c0a991c8740b413559bfc894aa5ae1d5ed3704b5 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 1 Dec 2020 03:06:38 +0800 Subject: [PATCH] accumulate gradient for leaf tensor with previous graph and expose leaf tensor concept (#28429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * The leaf tensor concept is exposed and the gradient accumulation of leaf tensor * The leaf tensor concept is exposed and the gradient accumulation of leaf tensor * fix coverage * fix api doc * fix CI unittest * fix CI unittest * fix unitest * empty tensor does’t need inner_var_ * fix some error message --- paddle/fluid/imperative/basic_engine.cc | 82 +++++--- paddle/fluid/imperative/basic_engine.h | 7 +- paddle/fluid/imperative/dygraph_grad_maker.h | 1 + .../fluid/imperative/gradient_accumulator.cc | 190 ++++++++++++------ .../fluid/imperative/gradient_accumulator.h | 97 ++++++--- paddle/fluid/imperative/layer.cc | 4 + paddle/fluid/imperative/layer.h | 8 + .../fluid/imperative/partial_grad_engine.cc | 2 +- paddle/fluid/imperative/tests/CMakeLists.txt | 2 +- .../tests/test_gradient_accmulator.cc | 64 +++++- paddle/fluid/imperative/variable_wrapper.h | 46 +++++ paddle/fluid/pybind/imperative.cc | 31 ++- .../fluid/dygraph/varbase_patch_methods.py | 76 ++++--- python/paddle/fluid/optimizer.py | 2 + .../tests/unittests/test_imperative_basic.py | 108 ++++++++++ .../unittests/test_imperative_double_grad.py | 12 +- .../fluid/tests/unittests/test_momentum_op.py | 1 + .../fluid/tests/unittests/test_var_base.py | 26 +++ python/paddle/optimizer/optimizer.py | 2 + 19 files changed, 596 insertions(+), 165 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index b37d8619e7..f97ab4f4e0 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -38,7 +38,20 @@ namespace imperative { void BasicEngine::Init(VarBase* var, bool retain_graph) { retain_graph_ = retain_graph; init_node_ = var->GradVarBase()->GradNode(); - var->GradVarBase()->ClearGradNode(); + PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } if (init_node_ == nullptr || var->OverridedStopGradient()) { VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " @@ -47,7 +60,7 @@ void BasicEngine::Init(VarBase* var, bool retain_graph) { return; } - VLOG(3) << "start backward"; + VLOG(3) << "Init first node of backward"; PADDLE_ENFORCE_EQ( var->HasGradVar(), true, @@ -114,6 +127,10 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { accumulator->IncreaseRefCnt(); + VLOG(3) << "Prepare to acccumulate variable grad " << var->Name() << "(" + << var.get() << ") with reference count " + << accumulator->RefCnt(); + if (var->HasLeafHooks()) { VLOG(3) << "Grad variable wrapper (" << var->Name() << ") has leaf grad hooks."; @@ -123,10 +140,6 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { "Gradientaccumulator.")); accumulator->SetPostHooks(var->GetLeafHooks()); } - - VLOG(3) << "Prepare to acccumulate variable grad " << var->Name() << "(" - << var.get() << ") with reference count " - << accumulator->RefCnt(); } } } @@ -190,13 +203,14 @@ void BasicEngine::Execute() { // CheckBackWardInput CheckBackwardInputs(cur_op); - // Step 1: Run Backward + // Step 1: Run Backward OP auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); NameVarMap<VariableWrapper> tmp_outs(bwd_outs); - // 1. construct the output map 2. replace the element in the map - // A var may be coresponding to several grad var in one op + // 1. construct the temp output map, avoid to disrupt graph + // 2. replace the element in the map by temp var, because a + // var may be coresponding to several grad var in one op for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; @@ -213,15 +227,23 @@ void BasicEngine::Execute() { platform::errors::NotFound("Cannot find gradient of variable %s", var->Name())); - if (!var->OverridedStopGradient() && iter->second->RefCnt() == 1) { - no_need_run_accumulators_.emplace_back(iter->second.get()); - continue; + // leaf_accumulators_ : hooks and accumulate-grad for leaf tensor + if (var->IsLeafGrad()) { + leaf_accumulators_.insert(iter->second.get()); + + if (iter->second->HasInnerVar()) { + var = iter->second->InnerVar(); + } } - auto tmp_var = std::make_shared<VariableWrapper>(var->Name()); - tmp_var->SetType(var->Type()); - var = tmp_var; - need_accu_var_list_.emplace_back(iter->second.get(), var); + if (var->OverridedStopGradient() || iter->second->RefCnt() > 1) { + auto tmp_var = std::make_shared<VariableWrapper>(var->Name()); + tmp_var->SetType(var->Type()); + var = tmp_var; + need_accu_var_list_.emplace_back(iter->second.get(), var); + VLOG(10) << "create temporary var of " << var->Name() + << " for sum gradient within this graph!"; + } } } @@ -256,22 +278,32 @@ void BasicEngine::Execute() { cur_op.place()); } - // Step 2: Sum Gradient & Call Accumulator Hooks - for (auto* accumulator : no_need_run_accumulators_) { + // Step 2: Sum Gradient of This graph + for (auto& pair : need_accu_var_list_) { + pair.first->SumGrad(std::move(pair.second), cur_op.id()); + } + + // Step 3: Call Hooks && Sum Gradient with Pre-Graph && Call BackwardHooks + for (auto* accumulator : leaf_accumulators_) { + if (!accumulator->SumGradCompleted()) { + continue; + } + // 1. Call Hooks for **inner_var_** + + // 2. Sum Gradient with Previous Graph + accumulator->AccumulateGrad(); + + // 3. Call backward Hooks for **var_** if (accumulator->HasPostHooks()) { accumulator->CallBackwardPostHooks(); } } - for (auto& pair : need_accu_var_list_) { - pair.first->Add(std::move(pair.second), cur_op.id()); - } - need_accu_var_list_.clear(); - no_need_run_accumulators_.clear(); + leaf_accumulators_.clear(); - VLOG(3) << "Remove op after op " << cur_op.Type() << " runs"; if (!retain_graph_) { + VLOG(3) << "Remove op after op " << cur_op.Type() << " runs"; cur_op.ClearBackwardTrace(); } } @@ -301,7 +333,7 @@ void BasicEngine::Clear() { node_deps_.clear(); accumulators_.clear(); need_accu_var_list_.clear(); - no_need_run_accumulators_.clear(); + leaf_accumulators_.clear(); } } // namespace imperative diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 92e7fe7eb8..d7ac7594ef 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -16,6 +16,7 @@ #include <memory> #include <unordered_map> +#include <unordered_set> #include <utility> #include <vector> #include "paddle/fluid/imperative/engine.h" @@ -49,9 +50,9 @@ class BasicEngine : public Engine { accumulators_; std::vector<std::pair<GradientAccumulator*, std::shared_ptr<VariableWrapper>>> need_accu_var_list_; - // Accumulators that does not need to perform accumulation operations, - // the ref_cnt_=1, corresponding to need_accu_var_list_ - std::vector<GradientAccumulator*> no_need_run_accumulators_; + // leaf_accumulators_ is only for leaf tensor(hooks/accumulate grad) + std::unordered_set<GradientAccumulator*> leaf_accumulators_; + bool retain_graph_; }; diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index 0d81221c43..d650452ad9 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -219,6 +219,7 @@ class TracedGradOp { if (kRole == TracedVarRole::kBackward) { for (auto& var : vars) { if (var && !var->OverridedStopGradient()) { + var->SetGraphIsFreed(false); var->SetGradNode(node_); } } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 00fd18e5e2..66c4d1c5f5 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -35,11 +35,12 @@ namespace imperative { static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src, bool force_copy) { if (!force_copy) { + VLOG(6) << "Just Move Variable when sum gradients within this graph"; *dst = std::move(*src); return; } - VLOG(10) << "Copy occurs when accumulating gradients"; + VLOG(6) << "Copy occurs when sum gradients within this graph"; if (src->IsType<framework::LoDTensor>()) { auto& src_tensor = src->Get<framework::LoDTensor>(); if (!dst->IsType<framework::LoDTensor>()) { @@ -61,7 +62,7 @@ static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src, dst_selected_rows->set_height(src_selected_rows.height()); } else { PADDLE_THROW(platform::errors::PermissionDenied( - "Only support LoDTensor and SelectedRows for gradient accumulation")); + "Only support LoDTensor and SelectedRows for sum gradient")); } } @@ -313,9 +314,9 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge( } void VariableWrapperAdd(std::shared_ptr<VariableWrapper> var, - VariableWrapper* var_, bool unchange_input) { + VariableWrapper* dst_var, bool unchange_input) { auto& src = var->Var(); - auto* dst = var_->MutableVar(); + auto* dst = dst_var->MutableVar(); if (dst->IsType<framework::LoDTensor>()) { if (src.IsType<framework::LoDTensor>()) { TensorAdd(src, dst); @@ -362,8 +363,57 @@ static platform::Place GetPlaceOfVar( return place; } -void EagerGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, - size_t trace_id, bool unchange_input) { +void GradientAccumulator::AccumulateGrad() { + /** + * If the gradient has been calculated by previous graph, + * it should be added to the previous graph result. + */ + if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { + return; + } + PADDLE_ENFORCE_EQ(HasInnerVar(), true, + platform::errors::InvalidArgument( + "Leaf tensor should have inner var to store results of " + "this auto-grad")); + PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, + platform::errors::InvalidArgument( + "Interior var of Leaf tensor should be initialized.")); + auto* src = inner_var_->MutableVar(); + auto* dst = var_->MutableVar(); + if (!var_->IsEmpty()) { + VLOG(6) << "Leaf Gradient Var(" << var_->Name() + << ") has been calculated by previous graph, will accumulate on " + "previous graph."; + if (dst->IsType<framework::LoDTensor>()) { + if (src->IsType<framework::LoDTensor>()) { + TensorAdd(*src, dst); + } else if (src->IsType<framework::SelectedRows>()) { + SelectedRowsAddToTensor(*src, dst); + } + } else if (dst->IsType<framework::SelectedRows>()) { + if (src->IsType<framework::LoDTensor>()) { + SelectedRowsAddToTensor(*dst, src); + *dst = std::move(*src); + } else if (src->IsType<framework::SelectedRows>()) { + auto temp = SelectedRowsMerge(*src, *dst); + *dst = std::move(*(temp->MutableVar())); + } + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + } else { + VLOG(6) << "Leaf Gradient Var(" << var_->Name() + << ") has not been initialized, not accumulate. Just move"; + *(dst) = std::move(*src); + var_->SetType(inner_var_->Type()); + var_->SetDataType(inner_var_->DataType()); + } + inner_var_.reset(); +} + +void EagerGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var, + size_t trace_id, bool unchange_input) { /** * If var has grad node, it indicates that this var would be an input * of a grad op. Therefore, it should not be changed. @@ -372,53 +422,57 @@ void EagerGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, unchange_input = true; } - auto* dst_var = var_->MutableVar(); + auto* dst_var = Var(); platform::Place place = GetPlaceOfVar(var); - if (!var_->OverridedStopGradient()) { - VLOG(3) << "Sum Gradient for: " << var_->Name(); - if (cur_cnt_ == 0) { - MoveOrCopyVar(dst_var, var->MutableVar(), unchange_input); + if (!dst_var->OverridedStopGradient()) { + if (CurCnt() == 0) { + MoveOrCopyVar(dst_var->MutableVar(), var->MutableVar(), unchange_input); } else { - VariableWrapperAdd(var, var_, unchange_input); + VLOG(6) << "Sum Gradient for: " << dst_var->Name() + << " within this graph."; + VariableWrapperAdd(var, dst_var, unchange_input); } } else { - if (!var_->Var().IsInitialized() || - !var_->Var().Get<framework::LoDTensor>().IsInitialized()) { - VLOG(6) << "Set StopGradient Grad: " << var_->Name() << " as zero "; - + if (!dst_var->Var().IsInitialized() || + !dst_var->Var().Get<framework::LoDTensor>().IsInitialized()) { + VLOG(6) << "Set StopGradient Grad: " << dst_var->Name() << " as zero "; auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); - if (!var_->Var().IsInitialized()) { - auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); - VLOG(6) << "Dims of " << var_->Name() << " is set as: " + if (!dst_var->Var().IsInitialized()) { + auto* tensor = + dst_var->MutableVar()->GetMutable<framework::LoDTensor>(); + VLOG(6) << "Dims of " << dst_var->Name() << " is set as: " << var->Var().Get<framework::LoDTensor>().dims(); tensor->Resize(var->Var().Get<framework::LoDTensor>().dims()); tensor->mutable_data(place, var->DataType()); operators::math::set_constant(*dev_ctx, tensor, 0.0); } else { - auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); + auto* tensor = + dst_var->MutableVar()->GetMutable<framework::LoDTensor>(); tensor->mutable_data(place, var->DataType()); operators::math::set_constant(*dev_ctx, tensor, 0.0); } } } - if (var_->Var().IsType<framework::LoDTensor>()) { - var_->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (var_->Var().IsType<framework::SelectedRows>()) { - var_->SetType(framework::proto::VarType::SELECTED_ROWS); + // Type may be changed after OP run, such as VarTypeInference + // so synchronous VariableWrapper with Variable. + if (dst_var->Var().IsType<framework::LoDTensor>()) { + dst_var->SetType(framework::proto::VarType::LOD_TENSOR); + } else if (dst_var->Var().IsType<framework::SelectedRows>()) { + dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } - // Increase count & call post hooks + // Increase curent count IncreaseCurCnt(); } -void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, - size_t trace_id, bool unchange_input) { - auto* dst_var = var_->MutableVar(); +void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var, + size_t trace_id, bool unchange_input) { + auto* dst_var = Var(); platform::Place place = GetPlaceOfVar(var); - if (!var_->OverridedStopGradient()) { + if (!dst_var->OverridedStopGradient()) { if (ref_cnt_ == 1) { - MoveOrCopyVar(dst_var, var->MutableVar(), + MoveOrCopyVar(dst_var->MutableVar(), var->MutableVar(), unchange_input || var->HasGradNode()); } else { if (tmp_grad_vars_.empty()) { @@ -431,6 +485,8 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, return; } + VLOG(6) << "Sum Gradient for: " << dst_var->Name() + << " within this graph."; std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(), [](const SavedVarInfo& info1, const SavedVarInfo& info2) { return info1.trace_id > info2.trace_id; @@ -444,22 +500,22 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, #ifdef PADDLE_WITH_CUDA if (paddle::platform::is_gpu_place(place)) { - bool dst_varbase_is_initialized = false; - // accumulate selected rows firstly + // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { if (!var_info.var->Var().IsType<framework::SelectedRows>()) { continue; } - if (!dst_varbase_is_initialized) { - dst_varbase_is_initialized = true; - MoveOrCopyVar(dst_var, var_info.var->MutableVar(), + if (CurCnt() == 0) { + MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(), var_info.unchange_input); } else { - VariableWrapperAdd(var_info.var, var_, var_info.unchange_input); + VariableWrapperAdd(var_info.var, dst_var, var_info.unchange_input); } var_info.var = nullptr; + // Increase count + IncreaseCurCnt(); } for (auto& var_info : tmp_grad_vars_) { @@ -470,25 +526,38 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, PADDLE_ENFORCE_EQ(var_info.var->Var().IsType<framework::LoDTensor>(), true, platform::errors::PermissionDenied( "Gradient var must be LoDTensor")); - - if (!dst_varbase_is_initialized) { - dst_varbase_is_initialized = true; - MoveOrCopyVar(dst_var, var_info.var->MutableVar(), + if (CurCnt() == 0) { + MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(), var_info.unchange_input); } else { - VariableWrapperAdd(var_info.var, var_, var_info.unchange_input); + VariableWrapperAdd(var_info.var, dst_var, var_info.unchange_input); } var_info.var = nullptr; + // Increase count + IncreaseCurCnt(); } } else { #endif - MoveOrCopyVar(dst_var, tmp_grad_vars_[0].var->MutableVar(), - tmp_grad_vars_[0].unchange_input); - for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) { - VariableWrapperAdd(tmp_grad_vars_[i].var, var_, - tmp_grad_vars_[i].unchange_input); - tmp_grad_vars_[i].var = nullptr; + for (auto& var_info : tmp_grad_vars_) { + if (!var_info.var) { + continue; + } + PADDLE_ENFORCE_EQ( + var_info.var->Var().IsType<framework::LoDTensor>() || + var_info.var->Var().IsType<framework::SelectedRows>(), + true, platform::errors::PermissionDenied("The type of Gradient " + "var must be LoDTensor " + "or SelectedRows")); + if (CurCnt() == 0) { + MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(), + var_info.unchange_input); + } else { + VariableWrapperAdd(var_info.var, dst_var, var_info.unchange_input); + } + var_info.var = nullptr; + // Increase count + IncreaseCurCnt(); } #ifdef PADDLE_WITH_CUDA } @@ -496,19 +565,21 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, tmp_grad_vars_.clear(); } } else { - if (!var_->Var().IsInitialized() || - !var_->Var().Get<framework::LoDTensor>().IsInitialized()) { + if (!dst_var->Var().IsInitialized() || + !dst_var->Var().Get<framework::LoDTensor>().IsInitialized()) { VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero"; auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); - if (!var_->Var().IsInitialized()) { - auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); - VLOG(6) << "Dims of " << var_->Name() << " is set as: " + if (!dst_var->Var().IsInitialized()) { + auto* tensor = + dst_var->MutableVar()->GetMutable<framework::LoDTensor>(); + VLOG(6) << "Dims of " << dst_var->Name() << " is set as: " << var->Var().Get<framework::LoDTensor>().dims(); tensor->Resize(var->Var().Get<framework::LoDTensor>().dims()); tensor->mutable_data(place, var->DataType()); operators::math::set_constant(*dev_ctx, tensor, 0.0); } else { - auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); + auto* tensor = + dst_var->MutableVar()->GetMutable<framework::LoDTensor>(); tensor->mutable_data(place, var->DataType()); operators::math::set_constant(*dev_ctx, tensor, 0.0); } @@ -517,15 +588,10 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, tmp_grad_vars_.clear(); } - if (var_->Var().IsType<framework::LoDTensor>()) { - var_->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (var_->Var().IsType<framework::SelectedRows>()) { - var_->SetType(framework::proto::VarType::SELECTED_ROWS); - } - - // call post hooks - if (HasPostHooks()) { - CallBackwardPostHooks(); + if (dst_var->Var().IsType<framework::LoDTensor>()) { + dst_var->SetType(framework::proto::VarType::LOD_TENSOR); + } else if (dst_var->Var().IsType<framework::SelectedRows>()) { + dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } } diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 2d0cc6e892..ab5ec52fb2 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -26,17 +26,72 @@ namespace imperative { class GradientAccumulator { public: - explicit GradientAccumulator(VariableWrapper* var) : var_(var) {} + explicit GradientAccumulator(VariableWrapper* var) { + // var may be initialized, so Synchronous VariableWrapper with Variable + if (var && var->Var().IsInitialized()) { + if (var->Var().IsType<framework::LoDTensor>()) { + var->SetType(framework::proto::VarType::LOD_TENSOR); + } else if (var->Var().IsType<framework::SelectedRows>()) { + var->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + } + + // inner_var_ record the grad of this auto-grad. + // Only need to generate inner var for non-empty leaf-tensor. + if (var->IsLeafGrad() && !var->IsEmpty()) { + inner_var_ = std::make_shared<VariableWrapper>(var->Name()); + inner_var_->SetType(var->Type()); + inner_var_->SetDataType(var->DataType()); + inner_var_->InnerSetOverridedStopGradient( + var->InnerOverridedStopGradient()); + VLOG(6) << " Create inner grad var for (" << var->Name() + << ") to store result of this Graph"; + } + + // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag + var->SetIsEmpty(false); - virtual void Add(std::shared_ptr<VariableWrapper> var, size_t trace_id, - bool unchange_input = false) = 0; + // var_ is the final grad, processed by hooks and grad accumulation + var_ = var; + } + + // function that Sum Gradient with this Graph + virtual void SumGrad(std::shared_ptr<VariableWrapper> var, size_t trace_id, + bool unchange_input = false) = 0; virtual ~GradientAccumulator() = default; - inline void IncreaseRefCnt() { ++ref_cnt_; } + inline void IncreaseRefCnt() { + ++ref_cnt_; + VLOG(6) << var_->Name() << " Increase total count to " << ref_cnt_; + } + + inline void IncreaseCurCnt() { + ++cur_cnt_; + VLOG(6) << var_->Name() << " Increase current count to " << cur_cnt_ + << ", total count: " << ref_cnt_; + } + + inline size_t CurCnt() const { return cur_cnt_; } inline size_t RefCnt() const { return ref_cnt_; } + inline bool SumGradCompleted() const { + return cur_cnt_ == ref_cnt_ || ref_cnt_ == 1; + } + + std::shared_ptr<VariableWrapper>& InnerVar() { return inner_var_; } + + // return the var that will be calculated in this graph + VariableWrapper* Var() { + return inner_var_ != nullptr ? inner_var_.get() : var_; + } + + inline bool HasInnerVar() const { return inner_var_ != nullptr; } + /* Hook related methods */ inline bool HasPostHooks() const { return !post_hooks_.expired(); } @@ -54,6 +109,11 @@ class GradientAccumulator { post_hooks_ = hooks; } } + // void CallHooks(){} + // ** inner_var_ ** + + // function that Sum Gradient with Previous Graph + void AccumulateGrad(); // call backward post hooks, such as reduce hook void CallBackwardPostHooks() { @@ -71,8 +131,11 @@ class GradientAccumulator { protected: VariableWrapper* var_; + // NOTE: only gradient accumulater of leaf tensor should hold + // inner_var_, So not hold it by other shared pointer. + std::shared_ptr<VariableWrapper> inner_var_; size_t ref_cnt_{0}; - + size_t cur_cnt_{0}; std::weak_ptr<LeafVarHookPipeline> post_hooks_; }; @@ -80,32 +143,16 @@ class EagerGradientAccumulator : public GradientAccumulator { public: using GradientAccumulator::GradientAccumulator; - void Add(std::shared_ptr<VariableWrapper> var, size_t trace_id, - bool unchange_input) override; - - private: - inline bool AccumulateCompleted() const { return cur_cnt_ == ref_cnt_; } - - void IncreaseCurCnt() { - ++cur_cnt_; - VLOG(3) << "IncreaseCurCnt: cur_cnt " << cur_cnt_ << ", ref_cnt " - << ref_cnt_; - // After all tmp gradient being accumulated to grad var, run hooks - if (AccumulateCompleted() && HasPostHooks()) { - CallBackwardPostHooks(); - } - } - - private: - size_t cur_cnt_{0}; + void SumGrad(std::shared_ptr<VariableWrapper> var, size_t trace_id, + bool unchange_input) override; }; class SortedGradientAccumulator : public GradientAccumulator { public: using GradientAccumulator::GradientAccumulator; - void Add(std::shared_ptr<VariableWrapper> var, size_t trace_id, - bool unchange_input) override; + void SumGrad(std::shared_ptr<VariableWrapper> var, size_t trace_id, + bool unchange_input) override; private: struct SavedVarInfo { diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index eaf9986b20..6f490c3c2b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -215,6 +215,10 @@ void VarBase::ClearGradient() { #endif } } + // TODO(zhouwei): It's better to free memory of grad by grad_t->claer. + // But will have some bug on mac CPU of yolov3 model, why? + // After fix this bug, function SetIsEmpty() isn't need + grad_var_->SharedVar()->SetIsEmpty(true); } } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 9a587fd6d6..1a974ab346 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -146,6 +146,8 @@ class VarBase { bool OverridedStopGradient() const { return var_->OverridedStopGradient(); } + bool IsLeaf() const { return var_->IsLeaf(); } + void InnerSetOverridedStopGradient(bool stop_gradient) { if (var_->InnerOverridedStopGradient() == -1) { var_->InnerSetOverridedStopGradient(stop_gradient); @@ -182,6 +184,10 @@ class VarBase { std::string GradVarName() { return framework::GradVarName(Name()); } + void SetGraphIsFreed(bool free) { graph_is_free_ = free; } + + const bool& GraphIsFreed() const { return graph_is_free_; } + void SetType(framework::proto::VarType::Type type) { var_->SetType(type); } framework::proto::VarType::Type Type() const { return var_->Type(); } @@ -220,6 +226,8 @@ class VarBase { */ std::shared_ptr<GradOpNode> grad_node_; + bool graph_is_free_ = false; + mutable size_t copied_counter_ = 0; static ThreadSafeNameSet name_set_; diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 5c717835e5..d8f828ede2 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -367,7 +367,7 @@ class GradientAccumulationInfo { "Reference count overflows, this may be a bug")); *is_finished = (cur_ref_cnt_ == total_ref_cnt_); - accumulator_->Add(grad_var_partial, trace_id, unchange_input); + accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index a8de1e6b03..782f6dad58 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -7,7 +7,7 @@ else() endif(WIN32) -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows selected_rows_functor gradient_accumulator) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows selected_rows_functor gradient_accumulator math_function) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 49bc24edba..c394ce07df 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -263,6 +264,9 @@ static void TestGradientAccumulatorTestUnchangeInput( for (auto use_tensor1 : use_tensors) { for (auto use_tensor2 : use_tensors) { + /** g_accum1 && g_accum2: has not been initialized + * test accumulate on this graph + */ auto g_var1 = std::make_shared<VariableWrapper>("g_var1"); g_var1->SetOverridedStopGradient(false); auto g_accum1 = CreateAccumulator(g_var1, sort_gradient); @@ -278,8 +282,14 @@ static void TestGradientAccumulatorTestUnchangeInput( auto var1 = create_var(use_tensor1); auto var_wrapper1_1 = std::make_shared<VariableWrapper>("tmp1_1"); auto var_wrapper2_1 = std::make_shared<VariableWrapper>("tmp2_1"); + + ASSERT_EQ(var_wrapper1_1->IsEmpty(), true); CopyVar(var1, var_wrapper1_1->MutableVar()); + ASSERT_EQ(var_wrapper1_1->IsEmpty(), false); + + ASSERT_EQ(var_wrapper2_1->IsEmpty(), true); CopyVar(var1, var_wrapper2_1->MutableVar()); + ASSERT_EQ(var_wrapper2_1->IsEmpty(), false); auto var2 = create_var(use_tensor2); auto var_wrapper1_2 = std::make_shared<VariableWrapper>("tmp1_2"); @@ -287,15 +297,59 @@ static void TestGradientAccumulatorTestUnchangeInput( CopyVar(var2, var_wrapper1_2->MutableVar()); CopyVar(var2, var_wrapper2_2->MutableVar()); - g_accum1->Add(var_wrapper1_1, 0, false); - g_accum1->Add(var_wrapper1_2, 1, false); - - g_accum2->Add(var_wrapper2_1, 0, true); - g_accum2->Add(var_wrapper2_2, 1, true); + // g_accum1: inner_var_ = var1 + var2 + g_accum1->SumGrad(var_wrapper1_1, 0, false); + g_accum1->SumGrad(var_wrapper1_2, 1, false); + ASSERT_EQ(g_accum1->CurCnt(), g_accum1->RefCnt()); + ASSERT_TRUE(g_accum1->SumGradCompleted()); + // g_accum1: inner_var_ -> var_ + g_accum1->AccumulateGrad(); + + // g_accum2: inner_var_ = var1 + var2 + g_accum2->SumGrad(var_wrapper2_1, 0, true); + g_accum2->SumGrad(var_wrapper2_2, 1, true); + ASSERT_EQ(g_accum2->CurCnt(), g_accum2->RefCnt()); + ASSERT_TRUE(g_accum2->SumGradCompleted()); + // g_accum2: inner_var_ -> var_ + g_accum2->AccumulateGrad(); ASSERT_TRUE(IsEqualVar(var_wrapper2_1->Var(), var1)); ASSERT_TRUE(IsEqualVar(var_wrapper2_2->Var(), var2)); ASSERT_TRUE(IsEqualVar(g_var1->Var(), g_var2->Var())); + + /** g_accum3 && g_accum4: has been initialized + * test accumulate on previous graph + */ + auto var3 = create_var(use_tensor1); + auto var_wrapper3_3 = std::make_shared<VariableWrapper>("tmp1_3"); + auto var_wrapper4_3 = std::make_shared<VariableWrapper>("tmp2_3"); + var_wrapper3_3->SetOverridedStopGradient(false); + var_wrapper4_3->SetOverridedStopGradient(false); + CopyVar(var3, var_wrapper3_3->MutableVar()); + CopyVar(var3, var_wrapper4_3->MutableVar()); + + auto g_accum3 = CreateAccumulator(var_wrapper3_3, sort_gradient); + g_accum3->IncreaseRefCnt(); + auto g_accum4 = CreateAccumulator(var_wrapper4_3, sort_gradient); + g_accum4->IncreaseRefCnt(); + + auto var4 = create_var(use_tensor2); + auto var_wrapper3_4 = std::make_shared<VariableWrapper>("tmp1_4"); + auto var_wrapper4_4 = std::make_shared<VariableWrapper>("tmp2_4"); + CopyVar(var4, var_wrapper3_4->MutableVar()); + CopyVar(var4, var_wrapper4_4->MutableVar()); + + g_accum3->SumGrad(var_wrapper3_4, 0, false); + ASSERT_TRUE(g_accum3->SumGradCompleted()); + // g_accum4: var_(var_wrapper3_3) + inner_var_ -> var_ + g_accum3->AccumulateGrad(); + + g_accum4->SumGrad(var_wrapper4_4, 0, false); + ASSERT_TRUE(g_accum4->SumGradCompleted()); + // g_accum4: var_(var_wrapper4_3) + inner_var_ -> var_ + g_accum4->AccumulateGrad(); + + ASSERT_TRUE(IsEqualVar(var_wrapper3_3->Var(), var_wrapper4_3->Var())); } } } diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index df972035ae..fec12f2da1 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -68,10 +68,50 @@ class VariableWrapper { } } + bool IsLeaf() const { + if (OverridedStopGradient()) { + return true; + } + if (HasGradVar() && !GetGradVar()->HasGradNode()) { + return true; + } + return false; + } + + bool IsLeafGrad() const { + if (!HasGradVar() && !HasGradNode() && !OverridedStopGradient()) { + return true; + } + return false; + } + void SetPersistable(bool persistable) { persistable_ = persistable; } bool Persistable() const { return persistable_; } + bool IsEmpty() const { + bool is_empty = true; + if (var_.IsInitialized()) { + const framework::Tensor* tensor = nullptr; + if (var_.IsType<framework::LoDTensor>()) { + tensor = &(var_.Get<framework::LoDTensor>()); + } else if (var_.IsType<framework::SelectedRows>()) { + tensor = &(var_.Get<framework::SelectedRows>().value()); + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + if (tensor && tensor->IsInitialized()) { + is_empty = false; + } + } + return is_empty || is_empty_; + } + + // TODO(zhouwei): fix Tensor.clear_gradient() bug, function SetIsEmpty() isn't + // need + void SetIsEmpty(bool is_empty) { is_empty_ = is_empty; } + const std::string& Name() const { return name_; } void SetName(const std::string& name) { name_ = name; } @@ -96,6 +136,8 @@ class VariableWrapper { bool HasGradNode() const { return !grad_node_.expired(); } + bool HasGradVar() const { return !grad_var_.expired(); } + framework::proto::VarType::Type DataType() const { const framework::Tensor* tensor = nullptr; if (var_.IsInitialized()) { @@ -265,6 +307,10 @@ class VariableWrapper { std::weak_ptr<VariableWrapper> grad_var_; std::weak_ptr<GradOpNode> grad_node_; + // TODO(zhouwei): fix bug of Tensor.clear_gradient(), function SetIsEmpty() + // isn't need + bool is_empty_{false}; + // NOTE: only grad var can hold hooks now // only interior var can hold interior hooks std::shared_ptr<InteriorVarHookPipeline> interior_hooks_; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index d675782a48..3510c9d152 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -670,7 +670,6 @@ void BindImperative(py::module *m_ptr) { return TensorToPyArray(tensor, true); }, R"DOC( - Returns a numpy array shows the value of current Tensor. Returns: @@ -689,7 +688,6 @@ void BindImperative(py::module *m_ptr) { data = paddle.to_tensor(data) x = linear(data) print(x.numpy()) - )DOC") .def("detach", [](const imperative::VarBase @@ -1080,6 +1078,35 @@ void BindImperative(py::module *m_ptr) { return std::vector<int>(); } }) + .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, + R"DOC( + Whether a Tensor is leaf Tensor. + + For the Tensor whose stop_gradient is ``True`` , it will be leaf Tensor. + + For the Tensor whose stop_gradient is ``False`` , it will be leaf Tensor too if it is created by user. + + Returns: + bool: Whether a Tensor is leaf Tensor. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor(1.) + print(x.is_leaf) # True + + x = paddle.to_tensor(1., stop_gradient=True) + y = x + 1 + print(x.is_leaf) # True + print(y.is_leaf) # True + + x = paddle.to_tensor(1., stop_gradient=False) + y = x + 1 + print(x.is_leaf) # True + print(y.is_leaf) # False + )DOC") .def_property_readonly( "place", [](imperative::VarBase &self) { return self.Place(); }, py::return_value_policy::copy) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index ab5135645a..6a59e33285 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -133,11 +133,12 @@ def monkey_patch_varbase(): @framework.dygraph_only def backward(self, retain_graph=False): """ - **Notes**: - **This API is ONLY available in Dygraph mode** - Run backward of current Graph which starts from current Tensor. + The new gradient will accumulat on previous gradient. + + You can clear gradient by ``Tensor.clear_grad()`` . + Args: retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter @@ -150,21 +151,20 @@ def monkey_patch_varbase(): Examples: .. code-block:: python - import numpy as np - import paddle - paddle.disable_static() - - x = np.ones([2, 2], np.float32) - inputs = [] - for _ in range(10): - tmp = paddle.to_tensor(x) - # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since - # there is no one need gradient on it. - tmp.stop_gradient=False - inputs.append(tmp) - ret = paddle.add_n(inputs) - loss = paddle.sum(ret) - loss.backward() + x = paddle.to_tensor(5., stop_gradient=False) + for i in range(5): + y = paddle.pow(x, 4.0) + y.backward() + print("{}: {}".format(i, x.grad)) + # 0: [500.] + # 1: [1000.] + # 2: [1500.] + # 3: [2000.] + # 4: [2500.] + + x.clear_grad() + print("{}".format(x.grad)) + # 0. """ if framework.in_dygraph_mode(): @@ -181,31 +181,21 @@ def monkey_patch_varbase(): @framework.dygraph_only def gradient(self): """ - **Notes**: - **This API is ONLY available in Dygraph mode** - - Get the Gradient of Current Variable + Get the Gradient of Current Tensor. Returns: - ndarray: Numpy value of the gradient of current Variable + ndarray: Numpy value of the gradient of current Tensor Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle - x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) - tmp.stop_gradient=False - inputs2.append(tmp) - ret2 = fluid.layers.sums(inputs2) - loss2 = fluid.layers.reduce_sum(ret2) - loss2.backward() - print(loss2.gradient()) + x = paddle.to_tensor(5., stop_gradient=False) + y = paddle.pow(x, 4.0) + y.backward() + print("grad of x: {}".format(x.grad)) + # [500.] """ if self._grad_ivar() is None: @@ -226,6 +216,12 @@ def monkey_patch_varbase(): return self.gradient() + def clear_grad(self): + """ + The alias of clear_gradient(). + """ + self.clear_gradient() + @property def inplace_version(self): """ @@ -284,10 +280,10 @@ def monkey_patch_varbase(): for method_name, method in ( ("__bool__", __bool__), ("__nonzero__", __nonzero__), ("_to_static_var", _to_static_var), ("set_value", set_value), - ("block", block), ("backward", backward), ("grad", grad), - ("inplace_version", inplace_version), ("gradient", gradient), - ("__str__", __str__), ("__repr__", __str__), ("__module__", "paddle"), - ("__name__", "Tensor")): + ("block", block), ("backward", backward), ("clear_grad", clear_grad), + ("inplace_version", inplace_version), ("grad", grad), + ("gradient", gradient), ("__str__", __str__), ("__repr__", __str__), + ("__module__", "paddle"), ("__name__", "Tensor")): setattr(core.VarBase, method_name, method) # patch math methods for varbase diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index f3c4984e29..d4468f0193 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -874,6 +874,8 @@ class Optimizer(object): def clear_gradients(self): """ Clear the gradients of all optimized parameters for model. + + If not, new gradient will accumulat on previous gradient. Returns: None diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 514154f1dd..d2f143d7ad 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -478,6 +478,114 @@ class TestImperative(unittest.TestCase): self.assertEqual(mlp._linear2, sublayers[1]) self.assertEqual(len(sublayers), 2) + def test_gradient_accumulation(self): + def test_single_api(sort_sum_gradient): + fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) + x = paddle.to_tensor(5., stop_gradient=False) + for i in range(10): + y = paddle.pow(x, 4.0) + y.backward() + print(x.grad) + self.assertEqual(x.grad, (i + 1) * 500) + x.clear_gradient() + self.assertEqual(x.grad, 0.) + for i in range(5): + y = paddle.pow(x, 4.0) + y.backward() + print(x.grad) + self.assertEqual(x.grad, (i + 1) * 500) + + def test_simple_net(sort_sum_gradient): + fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) + x = paddle.to_tensor(5., stop_gradient=False) + y = paddle.to_tensor(2., stop_gradient=False) + z = paddle.to_tensor(3., stop_gradient=False) + + def fun(x, y, z): + loss1 = x * x * y + loss2 = x * z + dx = paddle.grad([loss1], x, create_graph=True)[0] + # loss = x*x*y + x*z + 2*x*y + loss = loss1 + loss2 + dx + return loss + + loss = fun(x, y, z) + loss.backward(retain_graph=True) + # x.grad = 2*x*y + z + 2*y = 27 + self.assertTrue(np.array_equal(x.grad, [27])) + + loss.backward(retain_graph=True) + self.assertTrue(np.array_equal(x.grad, [54])) + + loss.backward() + self.assertTrue(np.array_equal(x.grad, [81])) + + with self.assertRaises(RuntimeError): + loss.backward() + + loss1 = x * x * y + loss2 = x * z + dx = paddle.grad([loss1], x, create_graph=True)[0] + loss = loss1 + loss2 + dx + loss.backward() + self.assertTrue(np.array_equal(dx.grad, [1])) + self.assertTrue(np.array_equal(x.grad, [108])) + + def test_mlp(sort_sum_gradient): + fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) + input_size = 5 + paddle.seed(1) + mlp1 = MLP(input_size=input_size) + # generate the gradient of each step + mlp2 = MLP(input_size=input_size) + + expected_weight1_grad = np.zeros(mlp2._linear1.weight.shape) + expected_bias1_grad = np.zeros(mlp2._linear1.bias.shape) + expected_weight2_grad = np.zeros(mlp2._linear2.weight.shape) + expected_bias2_grad = np.zeros(mlp2._linear2.bias.shape) + + for batch_id in range(24): + x = paddle.uniform([10, input_size]) + detach_x = x.detach() + clear_loss = mlp2(detach_x) + clear_loss.backward() + expected_weight1_grad = expected_weight1_grad + mlp2._linear1.weight.grad + expected_bias1_grad = expected_bias1_grad + mlp2._linear1.bias.grad + expected_weight2_grad = expected_weight2_grad + mlp2._linear2.weight.grad + expected_bias2_grad = expected_bias2_grad + mlp2._linear2.bias.grad + + loss = mlp1(x) + loss.backward() + + self.assertTrue(np.array_equal(loss.grad, [1])) + self.assertTrue( + np.allclose(mlp1._linear1.weight.grad, + expected_weight1_grad)) + self.assertTrue( + np.allclose(mlp1._linear1.bias.grad, expected_bias1_grad)) + self.assertTrue( + np.allclose(mlp1._linear2.weight.grad, + expected_weight2_grad)) + self.assertTrue( + np.allclose(mlp1._linear2.bias.grad, expected_bias2_grad)) + + mlp2.clear_gradients() + self.assertTrue(np.array_equal(clear_loss.grad, [1])) + if ((batch_id + 1) % 8) == 0: + mlp1.clear_gradients() + expected_weight1_grad = np.zeros(mlp2._linear1.weight.shape) + expected_bias1_grad = np.zeros(mlp2._linear1.bias.shape) + expected_weight2_grad = np.zeros(mlp2._linear2.weight.shape) + expected_bias2_grad = np.zeros(mlp2._linear2.bias.shape) + + with fluid.dygraph.guard(): + test_single_api(False) + test_single_api(True) + test_simple_net(False) + test_simple_net(True) + test_mlp(False) + test_mlp(True) + def test_dygraph_vs_static(self): np_inp1 = np.random.rand(4, 3, 3) np_inp2 = np.random.rand(4, 3, 3) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 8f3116f653..e41960f6b4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -214,7 +214,7 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + loss.backward(retain_graph=True) x_grad_actual = x.gradient() x_grad_expected = (2.0 / float(numel) * @@ -222,6 +222,16 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2 / float(numel))).astype('float32') self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + print(x_grad_actual) + print(x_grad_expected) + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + @dygraph_guard def test_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 6ee7940e17..40a1c8def5 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -457,6 +457,7 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): loss = paddle.mean(out) loss.backward() momentum.minimize(loss) + linear.clear_gradients() def __test_vs(self, place=fluid.CPUPlace()): paddle.disable_static(place=place) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 1f101a17da..86ba5a96b8 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -198,6 +198,32 @@ class TestVarBase(unittest.TestCase): var = fluid.dygraph.to_variable(t) self.assertTrue(np.array_equal(t, var.numpy())) + def test_leaf_tensor(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor(np.random.uniform(-1, 1, size=[10, 10])) + self.assertTrue(x.is_leaf) + y = x + 1 + self.assertTrue(y.is_leaf) + + x = paddle.to_tensor( + np.random.uniform( + -1, 1, size=[10, 10]), stop_gradient=False) + self.assertTrue(x.is_leaf) + y = x + 1 + self.assertFalse(y.is_leaf) + + linear = paddle.nn.Linear(10, 10) + input = paddle.to_tensor( + np.random.uniform( + -1, 1, size=[10, 10]).astype('float32'), + stop_gradient=False) + self.assertTrue(input.is_leaf) + + out = linear(input) + self.assertTrue(linear.weight.is_leaf) + self.assertTrue(linear.bias.is_leaf) + self.assertFalse(out.is_leaf) + def test_detach(self): with fluid.dygraph.guard(): x = paddle.to_tensor(1.0, dtype="float64", stop_gradient=False) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 295821a93c..1cfc0b66e7 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -793,6 +793,8 @@ class Optimizer(object): def clear_grad(self): """ Clear the gradients of all optimized parameters for model. + + If not, new gradient will accumulat on previous gradient. Returns: None -- GitLab