Unverified commit c0a991c8 authored by Zhou Wei, committed by GitHub

accumulate gradient for leaf tensor with previous graph and expose leaf tensor concept (#28429)

* The leaf tensor concept is exposed and the gradient accumulation of leaf tensor

* The leaf tensor concept is exposed and the gradient accumulation of leaf tensor

* fix coverage

* fix api doc

* fix CI unittest

* fix CI unittest

* fix unittest

* empty tensor doesn't need inner_var_

* fix some error message
Parent 74c43ac6
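A minimal sketch of the user-visible behavior added by this change, using only the paddle 2.0 dygraph API that appears elsewhere in this diff (paddle.to_tensor, Tensor.backward, the new Tensor.is_leaf property): the gradient of a user-created leaf tensor keeps accumulating across separate backward passes.

import paddle

x = paddle.to_tensor(5., stop_gradient=False)
print(x.is_leaf)             # True: created directly by the user

for i in range(3):
    y = paddle.pow(x, 4.0)   # dy/dx = 4 * x^3 = 500
    y.backward()             # each pass builds its own graph and frees it
    print(x.grad)            # [500.], [1000.], [1500.]: summed with the previous result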
...@@ -38,7 +38,20 @@ namespace imperative { ...@@ -38,7 +38,20 @@ namespace imperative {
void BasicEngine::Init(VarBase* var, bool retain_graph) { void BasicEngine::Init(VarBase* var, bool retain_graph) {
retain_graph_ = retain_graph; retain_graph_ = retain_graph;
init_node_ = var->GradVarBase()->GradNode(); init_node_ = var->GradVarBase()->GradNode();
var->GradVarBase()->ClearGradNode(); PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false,
platform::errors::Unavailable(
"%s trying to backward through the same graph a second "
"time, but this graph have already been freed. Please "
"specify Tensor.backward(retain_graph=True) when "
"calling backward at the first time.",
var->Name()));
if (!retain_graph) {
VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name()
<< " because of retain_graph=False when calling backward";
var->GradVarBase()->SetGraphIsFreed(true);
var->GradVarBase()->ClearGradNode();
}
if (init_node_ == nullptr || var->OverridedStopGradient()) { if (init_node_ == nullptr || var->OverridedStopGradient()) {
VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
...@@ -47,7 +60,7 @@ void BasicEngine::Init(VarBase* var, bool retain_graph) { ...@@ -47,7 +60,7 @@ void BasicEngine::Init(VarBase* var, bool retain_graph) {
return; return;
} }
VLOG(3) << "start backward"; VLOG(3) << "Init first node of backward";
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
var->HasGradVar(), true, var->HasGradVar(), true,
...@@ -114,6 +127,10 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { ...@@ -114,6 +127,10 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
accumulator->IncreaseRefCnt(); accumulator->IncreaseRefCnt();
VLOG(3) << "Prepare to accumulate variable grad " << var->Name() << "("
<< var.get() << ") with reference count "
<< accumulator->RefCnt();
if (var->HasLeafHooks()) { if (var->HasLeafHooks()) {
VLOG(3) << "Grad variable wrapper (" << var->Name() VLOG(3) << "Grad variable wrapper (" << var->Name()
<< ") has leaf grad hooks."; << ") has leaf grad hooks.";
...@@ -123,10 +140,6 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { ...@@ -123,10 +140,6 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
"Gradientaccumulator.")); "Gradientaccumulator."));
accumulator->SetPostHooks(var->GetLeafHooks()); accumulator->SetPostHooks(var->GetLeafHooks());
} }
VLOG(3) << "Prepare to acccumulate variable grad " << var->Name() << "("
<< var.get() << ") with reference count "
<< accumulator->RefCnt();
} }
} }
} }
...@@ -190,13 +203,14 @@ void BasicEngine::Execute() { ...@@ -190,13 +203,14 @@ void BasicEngine::Execute() {
// CheckBackWardInput // CheckBackWardInput
CheckBackwardInputs(cur_op); CheckBackwardInputs(cur_op);
// Step 1: Run Backward // Step 1: Run Backward OP
auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_ins = cur_op.GetInsMap();
auto& bwd_outs = cur_op.GetOutsMap(); auto& bwd_outs = cur_op.GetOutsMap();
NameVarMap<VariableWrapper> tmp_outs(bwd_outs); NameVarMap<VariableWrapper> tmp_outs(bwd_outs);
// 1. construct the output map 2. replace the element in the map // 1. construct the temp output map to avoid disrupting the graph
// A var may be coresponding to several grad var in one op // 2. replace the element in the map by a temp var, because a
var may correspond to several grad vars in one op
for (auto& pair : tmp_outs) { for (auto& pair : tmp_outs) {
if (!pair.second.IsGrad()) { if (!pair.second.IsGrad()) {
continue; continue;
...@@ -213,15 +227,23 @@ void BasicEngine::Execute() { ...@@ -213,15 +227,23 @@ void BasicEngine::Execute() {
platform::errors::NotFound("Cannot find gradient of variable %s", platform::errors::NotFound("Cannot find gradient of variable %s",
var->Name())); var->Name()));
if (!var->OverridedStopGradient() && iter->second->RefCnt() == 1) { // leaf_accumulators_ : hooks and accumulate-grad for leaf tensor
no_need_run_accumulators_.emplace_back(iter->second.get()); if (var->IsLeafGrad()) {
continue; leaf_accumulators_.insert(iter->second.get());
if (iter->second->HasInnerVar()) {
var = iter->second->InnerVar();
}
} }
auto tmp_var = std::make_shared<VariableWrapper>(var->Name()); if (var->OverridedStopGradient() || iter->second->RefCnt() > 1) {
tmp_var->SetType(var->Type()); auto tmp_var = std::make_shared<VariableWrapper>(var->Name());
var = tmp_var; tmp_var->SetType(var->Type());
need_accu_var_list_.emplace_back(iter->second.get(), var); var = tmp_var;
need_accu_var_list_.emplace_back(iter->second.get(), var);
VLOG(10) << "create temporary var of " << var->Name()
<< " to sum gradients within this graph!";
}
} }
} }
...@@ -256,22 +278,32 @@ void BasicEngine::Execute() { ...@@ -256,22 +278,32 @@ void BasicEngine::Execute() {
cur_op.place()); cur_op.place());
} }
// Step 2: Sum Gradient & Call Accumulator Hooks // Step 2: Sum Gradient of This graph
for (auto* accumulator : no_need_run_accumulators_) { for (auto& pair : need_accu_var_list_) {
pair.first->SumGrad(std::move(pair.second), cur_op.id());
}
// Step 3: Call Hooks && Sum Gradient with Pre-Graph && Call BackwardHooks
for (auto* accumulator : leaf_accumulators_) {
if (!accumulator->SumGradCompleted()) {
continue;
}
// 1. Call Hooks for **inner_var_**
// 2. Sum Gradient with Previous Graph
accumulator->AccumulateGrad();
// 3. Call backward Hooks for **var_**
if (accumulator->HasPostHooks()) { if (accumulator->HasPostHooks()) {
accumulator->CallBackwardPostHooks(); accumulator->CallBackwardPostHooks();
} }
} }
for (auto& pair : need_accu_var_list_) {
pair.first->Add(std::move(pair.second), cur_op.id());
}
need_accu_var_list_.clear(); need_accu_var_list_.clear();
no_need_run_accumulators_.clear(); leaf_accumulators_.clear();
VLOG(3) << "Remove op after op " << cur_op.Type() << " runs";
if (!retain_graph_) { if (!retain_graph_) {
VLOG(3) << "Remove op after op " << cur_op.Type() << " runs";
cur_op.ClearBackwardTrace(); cur_op.ClearBackwardTrace();
} }
} }
...@@ -301,7 +333,7 @@ void BasicEngine::Clear() { ...@@ -301,7 +333,7 @@ void BasicEngine::Clear() {
node_deps_.clear(); node_deps_.clear();
accumulators_.clear(); accumulators_.clear();
need_accu_var_list_.clear(); need_accu_var_list_.clear();
no_need_run_accumulators_.clear(); leaf_accumulators_.clear();
} }
} // namespace imperative } // namespace imperative
......
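The GraphIsFreed check added above surfaces in Python as a RuntimeError when backward is called again over a graph that has already been freed; a minimal sketch, mirroring the unit test added later in this diff:

import paddle

x = paddle.to_tensor(2., stop_gradient=False)
y = x * x
y.backward(retain_graph=True)   # keep the graph for another pass
y.backward()                    # allowed; the graph is freed after this call
try:
    y.backward()                # graph already freed
except RuntimeError as e:
    print("second backward without retain_graph failed:", e)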
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <memory> #include <memory>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/engine.h"
...@@ -49,9 +50,9 @@ class BasicEngine : public Engine { ...@@ -49,9 +50,9 @@ class BasicEngine : public Engine {
accumulators_; accumulators_;
std::vector<std::pair<GradientAccumulator*, std::shared_ptr<VariableWrapper>>> std::vector<std::pair<GradientAccumulator*, std::shared_ptr<VariableWrapper>>>
need_accu_var_list_; need_accu_var_list_;
// Accumulators that does not need to perform accumulation operations, // leaf_accumulators_ is only for leaf tensors (hooks / accumulate grad)
// the ref_cnt_=1, corresponding to need_accu_var_list_ std::unordered_set<GradientAccumulator*> leaf_accumulators_;
std::vector<GradientAccumulator*> no_need_run_accumulators_;
bool retain_graph_; bool retain_graph_;
}; };
......
...@@ -219,6 +219,7 @@ class TracedGradOp { ...@@ -219,6 +219,7 @@ class TracedGradOp {
if (kRole == TracedVarRole::kBackward) { if (kRole == TracedVarRole::kBackward) {
for (auto& var : vars) { for (auto& var : vars) {
if (var && !var->OverridedStopGradient()) { if (var && !var->OverridedStopGradient()) {
var->SetGraphIsFreed(false);
var->SetGradNode(node_); var->SetGradNode(node_);
} }
} }
......
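Since tracing a new grad node resets the freed flag, re-running the forward computation makes backward usable again without retain_graph; a small sketch of that behavior (same pattern as test_simple_net in the tests below):

import paddle

x = paddle.to_tensor(3., stop_gradient=False)
y = x * x
y.backward()           # the graph of y is freed here
try:
    y.backward()       # error: graph has already been freed
except RuntimeError:
    pass

y = x * x              # re-tracing the ops builds a fresh graph
y.backward()           # works again; x.grad keeps accumulating (6, then 12)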
...@@ -35,11 +35,12 @@ namespace imperative { ...@@ -35,11 +35,12 @@ namespace imperative {
static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src, static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src,
bool force_copy) { bool force_copy) {
if (!force_copy) { if (!force_copy) {
VLOG(6) << "Just move Variable when summing gradients within this graph";
*dst = std::move(*src); *dst = std::move(*src);
return; return;
} }
VLOG(10) << "Copy occurs when accumulating gradients"; VLOG(6) << "Copy occurs when summing gradients within this graph";
if (src->IsType<framework::LoDTensor>()) { if (src->IsType<framework::LoDTensor>()) {
auto& src_tensor = src->Get<framework::LoDTensor>(); auto& src_tensor = src->Get<framework::LoDTensor>();
if (!dst->IsType<framework::LoDTensor>()) { if (!dst->IsType<framework::LoDTensor>()) {
...@@ -61,7 +62,7 @@ static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src, ...@@ -61,7 +62,7 @@ static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src,
dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_height(src_selected_rows.height());
} else { } else {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Only support LoDTensor and SelectedRows for gradient accumulation")); "Only support LoDTensor and SelectedRows for sum gradient"));
} }
} }
...@@ -313,9 +314,9 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge( ...@@ -313,9 +314,9 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge(
} }
void VariableWrapperAdd(std::shared_ptr<VariableWrapper> var, void VariableWrapperAdd(std::shared_ptr<VariableWrapper> var,
VariableWrapper* var_, bool unchange_input) { VariableWrapper* dst_var, bool unchange_input) {
auto& src = var->Var(); auto& src = var->Var();
auto* dst = var_->MutableVar(); auto* dst = dst_var->MutableVar();
if (dst->IsType<framework::LoDTensor>()) { if (dst->IsType<framework::LoDTensor>()) {
if (src.IsType<framework::LoDTensor>()) { if (src.IsType<framework::LoDTensor>()) {
TensorAdd(src, dst); TensorAdd(src, dst);
...@@ -362,8 +363,57 @@ static platform::Place GetPlaceOfVar( ...@@ -362,8 +363,57 @@ static platform::Place GetPlaceOfVar(
return place; return place;
} }
void EagerGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, void GradientAccumulator::AccumulateGrad() {
size_t trace_id, bool unchange_input) { /**
* If the gradient has already been calculated by a previous graph,
* the new gradient should be added onto that previous result.
*/
if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) {
return;
}
PADDLE_ENFORCE_EQ(HasInnerVar(), true,
platform::errors::InvalidArgument(
"Leaf tensor should have inner var to store results of "
"this auto-grad"));
PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true,
platform::errors::InvalidArgument(
"Interior var of Leaf tensor should be initialized."));
auto* src = inner_var_->MutableVar();
auto* dst = var_->MutableVar();
if (!var_->IsEmpty()) {
VLOG(6) << "Leaf Gradient Var(" << var_->Name()
<< ") has been calculated by previous graph, will accumulate on "
"previous graph.";
if (dst->IsType<framework::LoDTensor>()) {
if (src->IsType<framework::LoDTensor>()) {
TensorAdd(*src, dst);
} else if (src->IsType<framework::SelectedRows>()) {
SelectedRowsAddToTensor(*src, dst);
}
} else if (dst->IsType<framework::SelectedRows>()) {
if (src->IsType<framework::LoDTensor>()) {
SelectedRowsAddToTensor(*dst, src);
*dst = std::move(*src);
} else if (src->IsType<framework::SelectedRows>()) {
auto temp = SelectedRowsMerge(*src, *dst);
*dst = std::move(*(temp->MutableVar()));
}
} else {
PADDLE_THROW(platform::errors::PermissionDenied(
"Only support LoDTensor and SelectedRows for gradient var"));
}
} else {
VLOG(6) << "Leaf Gradient Var(" << var_->Name()
<< ") has not been initialized; no accumulation needed, just move";
*(dst) = std::move(*src);
var_->SetType(inner_var_->Type());
var_->SetDataType(inner_var_->DataType());
}
inner_var_.reset();
}
void EagerGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
size_t trace_id, bool unchange_input) {
/** /**
* If var has grad node, it indicates that this var would be an input * If var has grad node, it indicates that this var would be an input
* of a grad op. Therefore, it should not be changed. * of a grad op. Therefore, it should not be changed.
...@@ -372,53 +422,57 @@ void EagerGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, ...@@ -372,53 +422,57 @@ void EagerGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var,
unchange_input = true; unchange_input = true;
} }
auto* dst_var = var_->MutableVar(); auto* dst_var = Var();
platform::Place place = GetPlaceOfVar(var); platform::Place place = GetPlaceOfVar(var);
if (!var_->OverridedStopGradient()) { if (!dst_var->OverridedStopGradient()) {
VLOG(3) << "Sum Gradient for: " << var_->Name(); if (CurCnt() == 0) {
if (cur_cnt_ == 0) { MoveOrCopyVar(dst_var->MutableVar(), var->MutableVar(), unchange_input);
MoveOrCopyVar(dst_var, var->MutableVar(), unchange_input);
} else { } else {
VariableWrapperAdd(var, var_, unchange_input); VLOG(6) << "Sum Gradient for: " << dst_var->Name()
<< " within this graph.";
VariableWrapperAdd(var, dst_var, unchange_input);
} }
} else { } else {
if (!var_->Var().IsInitialized() || if (!dst_var->Var().IsInitialized() ||
!var_->Var().Get<framework::LoDTensor>().IsInitialized()) { !dst_var->Var().Get<framework::LoDTensor>().IsInitialized()) {
VLOG(6) << "Set StopGradient Grad: " << var_->Name() << " as zero "; VLOG(6) << "Set StopGradient Grad: " << dst_var->Name() << " as zero ";
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (!var_->Var().IsInitialized()) { if (!dst_var->Var().IsInitialized()) {
auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); auto* tensor =
VLOG(6) << "Dims of " << var_->Name() << " is set as: " dst_var->MutableVar()->GetMutable<framework::LoDTensor>();
VLOG(6) << "Dims of " << dst_var->Name() << " is set as: "
<< var->Var().Get<framework::LoDTensor>().dims(); << var->Var().Get<framework::LoDTensor>().dims();
tensor->Resize(var->Var().Get<framework::LoDTensor>().dims()); tensor->Resize(var->Var().Get<framework::LoDTensor>().dims());
tensor->mutable_data(place, var->DataType()); tensor->mutable_data(place, var->DataType());
operators::math::set_constant(*dev_ctx, tensor, 0.0); operators::math::set_constant(*dev_ctx, tensor, 0.0);
} else { } else {
auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); auto* tensor =
dst_var->MutableVar()->GetMutable<framework::LoDTensor>();
tensor->mutable_data(place, var->DataType()); tensor->mutable_data(place, var->DataType());
operators::math::set_constant(*dev_ctx, tensor, 0.0); operators::math::set_constant(*dev_ctx, tensor, 0.0);
} }
} }
} }
if (var_->Var().IsType<framework::LoDTensor>()) { // The type may be changed after the OP runs, such as by VarTypeInference,
var_->SetType(framework::proto::VarType::LOD_TENSOR); // so synchronize the VariableWrapper with the Variable.
} else if (var_->Var().IsType<framework::SelectedRows>()) { if (dst_var->Var().IsType<framework::LoDTensor>()) {
var_->SetType(framework::proto::VarType::SELECTED_ROWS); dst_var->SetType(framework::proto::VarType::LOD_TENSOR);
} else if (dst_var->Var().IsType<framework::SelectedRows>()) {
dst_var->SetType(framework::proto::VarType::SELECTED_ROWS);
} }
// Increase count & call post hooks // Increase current count
IncreaseCurCnt(); IncreaseCurCnt();
} }
void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
size_t trace_id, bool unchange_input) { size_t trace_id, bool unchange_input) {
auto* dst_var = var_->MutableVar(); auto* dst_var = Var();
platform::Place place = GetPlaceOfVar(var); platform::Place place = GetPlaceOfVar(var);
if (!var_->OverridedStopGradient()) { if (!dst_var->OverridedStopGradient()) {
if (ref_cnt_ == 1) { if (ref_cnt_ == 1) {
MoveOrCopyVar(dst_var, var->MutableVar(), MoveOrCopyVar(dst_var->MutableVar(), var->MutableVar(),
unchange_input || var->HasGradNode()); unchange_input || var->HasGradNode());
} else { } else {
if (tmp_grad_vars_.empty()) { if (tmp_grad_vars_.empty()) {
...@@ -431,6 +485,8 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, ...@@ -431,6 +485,8 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var,
return; return;
} }
VLOG(6) << "Sum Gradient for: " << dst_var->Name()
<< " within this graph.";
std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(), std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(),
[](const SavedVarInfo& info1, const SavedVarInfo& info2) { [](const SavedVarInfo& info1, const SavedVarInfo& info2) {
return info1.trace_id > info2.trace_id; return info1.trace_id > info2.trace_id;
...@@ -444,22 +500,22 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, ...@@ -444,22 +500,22 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
bool dst_varbase_is_initialized = false; // sum selected rows first
// accumulate selected rows firstly
for (auto& var_info : tmp_grad_vars_) { for (auto& var_info : tmp_grad_vars_) {
if (!var_info.var->Var().IsType<framework::SelectedRows>()) { if (!var_info.var->Var().IsType<framework::SelectedRows>()) {
continue; continue;
} }
if (!dst_varbase_is_initialized) { if (CurCnt() == 0) {
dst_varbase_is_initialized = true; MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(),
MoveOrCopyVar(dst_var, var_info.var->MutableVar(),
var_info.unchange_input); var_info.unchange_input);
} else { } else {
VariableWrapperAdd(var_info.var, var_, var_info.unchange_input); VariableWrapperAdd(var_info.var, dst_var, var_info.unchange_input);
} }
var_info.var = nullptr; var_info.var = nullptr;
// Increase count
IncreaseCurCnt();
} }
for (auto& var_info : tmp_grad_vars_) { for (auto& var_info : tmp_grad_vars_) {
...@@ -470,25 +526,38 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, ...@@ -470,25 +526,38 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var,
PADDLE_ENFORCE_EQ(var_info.var->Var().IsType<framework::LoDTensor>(), PADDLE_ENFORCE_EQ(var_info.var->Var().IsType<framework::LoDTensor>(),
true, platform::errors::PermissionDenied( true, platform::errors::PermissionDenied(
"Gradient var must be LoDTensor")); "Gradient var must be LoDTensor"));
if (CurCnt() == 0) {
if (!dst_varbase_is_initialized) { MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(),
dst_varbase_is_initialized = true;
MoveOrCopyVar(dst_var, var_info.var->MutableVar(),
var_info.unchange_input); var_info.unchange_input);
} else { } else {
VariableWrapperAdd(var_info.var, var_, var_info.unchange_input); VariableWrapperAdd(var_info.var, dst_var, var_info.unchange_input);
} }
var_info.var = nullptr; var_info.var = nullptr;
// Increase count
IncreaseCurCnt();
} }
} else { } else {
#endif #endif
MoveOrCopyVar(dst_var, tmp_grad_vars_[0].var->MutableVar(), for (auto& var_info : tmp_grad_vars_) {
tmp_grad_vars_[0].unchange_input); if (!var_info.var) {
for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) { continue;
VariableWrapperAdd(tmp_grad_vars_[i].var, var_, }
tmp_grad_vars_[i].unchange_input); PADDLE_ENFORCE_EQ(
tmp_grad_vars_[i].var = nullptr; var_info.var->Var().IsType<framework::LoDTensor>() ||
var_info.var->Var().IsType<framework::SelectedRows>(),
true, platform::errors::PermissionDenied("The type of Gradient "
"var must be LoDTensor "
"or SelectedRows"));
if (CurCnt() == 0) {
MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(),
var_info.unchange_input);
} else {
VariableWrapperAdd(var_info.var, dst_var, var_info.unchange_input);
}
var_info.var = nullptr;
// Increase count
IncreaseCurCnt();
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
} }
...@@ -496,19 +565,21 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, ...@@ -496,19 +565,21 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var,
tmp_grad_vars_.clear(); tmp_grad_vars_.clear();
} }
} else { } else {
if (!var_->Var().IsInitialized() || if (!dst_var->Var().IsInitialized() ||
!var_->Var().Get<framework::LoDTensor>().IsInitialized()) { !dst_var->Var().Get<framework::LoDTensor>().IsInitialized()) {
VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero"; VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero";
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (!var_->Var().IsInitialized()) { if (!dst_var->Var().IsInitialized()) {
auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); auto* tensor =
VLOG(6) << "Dims of " << var_->Name() << " is set as: " dst_var->MutableVar()->GetMutable<framework::LoDTensor>();
VLOG(6) << "Dims of " << dst_var->Name() << " is set as: "
<< var->Var().Get<framework::LoDTensor>().dims(); << var->Var().Get<framework::LoDTensor>().dims();
tensor->Resize(var->Var().Get<framework::LoDTensor>().dims()); tensor->Resize(var->Var().Get<framework::LoDTensor>().dims());
tensor->mutable_data(place, var->DataType()); tensor->mutable_data(place, var->DataType());
operators::math::set_constant(*dev_ctx, tensor, 0.0); operators::math::set_constant(*dev_ctx, tensor, 0.0);
} else { } else {
auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>(); auto* tensor =
dst_var->MutableVar()->GetMutable<framework::LoDTensor>();
tensor->mutable_data(place, var->DataType()); tensor->mutable_data(place, var->DataType());
operators::math::set_constant(*dev_ctx, tensor, 0.0); operators::math::set_constant(*dev_ctx, tensor, 0.0);
} }
...@@ -517,15 +588,10 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var, ...@@ -517,15 +588,10 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VariableWrapper> var,
tmp_grad_vars_.clear(); tmp_grad_vars_.clear();
} }
if (var_->Var().IsType<framework::LoDTensor>()) { if (dst_var->Var().IsType<framework::LoDTensor>()) {
var_->SetType(framework::proto::VarType::LOD_TENSOR); dst_var->SetType(framework::proto::VarType::LOD_TENSOR);
} else if (var_->Var().IsType<framework::SelectedRows>()) { } else if (dst_var->Var().IsType<framework::SelectedRows>()) {
var_->SetType(framework::proto::VarType::SELECTED_ROWS); dst_var->SetType(framework::proto::VarType::SELECTED_ROWS);
}
// call post hooks
if (HasPostHooks()) {
CallBackwardPostHooks();
} }
} }
......
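Both accumulator variants (eager and sorted) now first sum the gradients of the current graph, and AccumulateGrad() then folds that sum into the leaf tensor's stored gradient. From Python the two variants are selected by the FLAGS_sort_sum_gradient flag used in the tests below; a sketch assuming only that flag and the paddle 2.0 API, both of which appear in this diff:

import paddle
import paddle.fluid as fluid

for sort_sum_gradient in [False, True]:
    fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient})
    x = paddle.to_tensor(5., stop_gradient=False)
    for i in range(3):
        y = paddle.pow(x, 4.0)
        y.backward()                            # dy/dx = 4 * x^3 = 500
        assert float(x.grad) == (i + 1) * 500   # same result for both accumulators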
...@@ -26,17 +26,72 @@ namespace imperative { ...@@ -26,17 +26,72 @@ namespace imperative {
class GradientAccumulator { class GradientAccumulator {
public: public:
explicit GradientAccumulator(VariableWrapper* var) : var_(var) {} explicit GradientAccumulator(VariableWrapper* var) {
// var may be initialized, so synchronize VariableWrapper with Variable
if (var && var->Var().IsInitialized()) {
if (var->Var().IsType<framework::LoDTensor>()) {
var->SetType(framework::proto::VarType::LOD_TENSOR);
} else if (var->Var().IsType<framework::SelectedRows>()) {
var->SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
PADDLE_THROW(platform::errors::PermissionDenied(
"Only support LoDTensor and SelectedRows for gradient var"));
}
}
// inner_var_ records the gradient computed by the current graph.
// An inner var is only needed for a non-empty leaf tensor.
if (var->IsLeafGrad() && !var->IsEmpty()) {
inner_var_ = std::make_shared<VariableWrapper>(var->Name());
inner_var_->SetType(var->Type());
inner_var_->SetDataType(var->DataType());
inner_var_->InnerSetOverridedStopGradient(
var->InnerOverridedStopGradient());
VLOG(6) << " Create inner grad var for (" << var->Name()
<< ") to store result of this Graph";
}
// TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag
var->SetIsEmpty(false);
virtual void Add(std::shared_ptr<VariableWrapper> var, size_t trace_id, // var_ is the final grad, processed by hooks and grad accumulation
bool unchange_input = false) = 0; var_ = var;
}
// function that sums the gradient within this graph
virtual void SumGrad(std::shared_ptr<VariableWrapper> var, size_t trace_id,
bool unchange_input = false) = 0;
virtual ~GradientAccumulator() = default; virtual ~GradientAccumulator() = default;
inline void IncreaseRefCnt() { ++ref_cnt_; } inline void IncreaseRefCnt() {
++ref_cnt_;
VLOG(6) << var_->Name() << " Increase total count to " << ref_cnt_;
}
inline void IncreaseCurCnt() {
++cur_cnt_;
VLOG(6) << var_->Name() << " Increase current count to " << cur_cnt_
<< ", total count: " << ref_cnt_;
}
inline size_t CurCnt() const { return cur_cnt_; }
inline size_t RefCnt() const { return ref_cnt_; } inline size_t RefCnt() const { return ref_cnt_; }
inline bool SumGradCompleted() const {
return cur_cnt_ == ref_cnt_ || ref_cnt_ == 1;
}
std::shared_ptr<VariableWrapper>& InnerVar() { return inner_var_; }
// return the var that will be calculated in this graph
VariableWrapper* Var() {
return inner_var_ != nullptr ? inner_var_.get() : var_;
}
inline bool HasInnerVar() const { return inner_var_ != nullptr; }
/* Hook related methods */ /* Hook related methods */
inline bool HasPostHooks() const { return !post_hooks_.expired(); } inline bool HasPostHooks() const { return !post_hooks_.expired(); }
...@@ -54,6 +109,11 @@ class GradientAccumulator { ...@@ -54,6 +109,11 @@ class GradientAccumulator {
post_hooks_ = hooks; post_hooks_ = hooks;
} }
} }
// void CallHooks(){}
// ** inner_var_ **
// function that sums the gradient with the previous graph
void AccumulateGrad();
// call backward post hooks, such as reduce hook // call backward post hooks, such as reduce hook
void CallBackwardPostHooks() { void CallBackwardPostHooks() {
...@@ -71,8 +131,11 @@ class GradientAccumulator { ...@@ -71,8 +131,11 @@ class GradientAccumulator {
protected: protected:
VariableWrapper* var_; VariableWrapper* var_;
// NOTE: only the gradient accumulator of a leaf tensor should hold
// inner_var_, so do not hold it with another shared pointer.
std::shared_ptr<VariableWrapper> inner_var_;
size_t ref_cnt_{0}; size_t ref_cnt_{0};
size_t cur_cnt_{0};
std::weak_ptr<LeafVarHookPipeline> post_hooks_; std::weak_ptr<LeafVarHookPipeline> post_hooks_;
}; };
...@@ -80,32 +143,16 @@ class EagerGradientAccumulator : public GradientAccumulator { ...@@ -80,32 +143,16 @@ class EagerGradientAccumulator : public GradientAccumulator {
public: public:
using GradientAccumulator::GradientAccumulator; using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VariableWrapper> var, size_t trace_id, void SumGrad(std::shared_ptr<VariableWrapper> var, size_t trace_id,
bool unchange_input) override; bool unchange_input) override;
private:
inline bool AccumulateCompleted() const { return cur_cnt_ == ref_cnt_; }
void IncreaseCurCnt() {
++cur_cnt_;
VLOG(3) << "IncreaseCurCnt: cur_cnt " << cur_cnt_ << ", ref_cnt "
<< ref_cnt_;
// After all tmp gradient being accumulated to grad var, run hooks
if (AccumulateCompleted() && HasPostHooks()) {
CallBackwardPostHooks();
}
}
private:
size_t cur_cnt_{0};
}; };
class SortedGradientAccumulator : public GradientAccumulator { class SortedGradientAccumulator : public GradientAccumulator {
public: public:
using GradientAccumulator::GradientAccumulator; using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VariableWrapper> var, size_t trace_id, void SumGrad(std::shared_ptr<VariableWrapper> var, size_t trace_id,
bool unchange_input) override; bool unchange_input) override;
private: private:
struct SavedVarInfo { struct SavedVarInfo {
......
...@@ -215,6 +215,10 @@ void VarBase::ClearGradient() { ...@@ -215,6 +215,10 @@ void VarBase::ClearGradient() {
#endif #endif
} }
} }
// TODO(zhouwei): It's better to free the grad memory via grad_t->clear().
// But that currently triggers a bug with the yolov3 model on macOS CPU; why?
// Once that bug is fixed, SetIsEmpty() won't be needed
grad_var_->SharedVar()->SetIsEmpty(true);
} }
} }
......
...@@ -146,6 +146,8 @@ class VarBase { ...@@ -146,6 +146,8 @@ class VarBase {
bool OverridedStopGradient() const { return var_->OverridedStopGradient(); } bool OverridedStopGradient() const { return var_->OverridedStopGradient(); }
bool IsLeaf() const { return var_->IsLeaf(); }
void InnerSetOverridedStopGradient(bool stop_gradient) { void InnerSetOverridedStopGradient(bool stop_gradient) {
if (var_->InnerOverridedStopGradient() == -1) { if (var_->InnerOverridedStopGradient() == -1) {
var_->InnerSetOverridedStopGradient(stop_gradient); var_->InnerSetOverridedStopGradient(stop_gradient);
...@@ -182,6 +184,10 @@ class VarBase { ...@@ -182,6 +184,10 @@ class VarBase {
std::string GradVarName() { return framework::GradVarName(Name()); } std::string GradVarName() { return framework::GradVarName(Name()); }
void SetGraphIsFreed(bool free) { graph_is_free_ = free; }
const bool& GraphIsFreed() const { return graph_is_free_; }
void SetType(framework::proto::VarType::Type type) { var_->SetType(type); } void SetType(framework::proto::VarType::Type type) { var_->SetType(type); }
framework::proto::VarType::Type Type() const { return var_->Type(); } framework::proto::VarType::Type Type() const { return var_->Type(); }
...@@ -220,6 +226,8 @@ class VarBase { ...@@ -220,6 +226,8 @@ class VarBase {
*/ */
std::shared_ptr<GradOpNode> grad_node_; std::shared_ptr<GradOpNode> grad_node_;
bool graph_is_free_ = false;
mutable size_t copied_counter_ = 0; mutable size_t copied_counter_ = 0;
static ThreadSafeNameSet name_set_; static ThreadSafeNameSet name_set_;
......
...@@ -367,7 +367,7 @@ class GradientAccumulationInfo { ...@@ -367,7 +367,7 @@ class GradientAccumulationInfo {
"Reference count overflows, this may be a bug")); "Reference count overflows, this may be a bug"));
*is_finished = (cur_ref_cnt_ == total_ref_cnt_); *is_finished = (cur_ref_cnt_ == total_ref_cnt_);
accumulator_->Add(grad_var_partial, trace_id, unchange_input); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input);
if (create_graph_) { if (create_graph_) {
VLOG(10) << "Store partial grad grad for double grad " VLOG(10) << "Store partial grad grad for double grad "
......
...@@ -7,7 +7,7 @@ else() ...@@ -7,7 +7,7 @@ else()
endif(WIN32) endif(WIN32)
cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows selected_rows_functor gradient_accumulator) cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows selected_rows_functor gradient_accumulator math_function)
cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy)
cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place)
cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy)
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace imperative = paddle::imperative; namespace imperative = paddle::imperative;
namespace platform = paddle::platform; namespace platform = paddle::platform;
...@@ -263,6 +264,9 @@ static void TestGradientAccumulatorTestUnchangeInput( ...@@ -263,6 +264,9 @@ static void TestGradientAccumulatorTestUnchangeInput(
for (auto use_tensor1 : use_tensors) { for (auto use_tensor1 : use_tensors) {
for (auto use_tensor2 : use_tensors) { for (auto use_tensor2 : use_tensors) {
/** g_accum1 && g_accum2: has not been initialized
* test accumulate on this graph
*/
auto g_var1 = std::make_shared<VariableWrapper>("g_var1"); auto g_var1 = std::make_shared<VariableWrapper>("g_var1");
g_var1->SetOverridedStopGradient(false); g_var1->SetOverridedStopGradient(false);
auto g_accum1 = CreateAccumulator(g_var1, sort_gradient); auto g_accum1 = CreateAccumulator(g_var1, sort_gradient);
...@@ -278,8 +282,14 @@ static void TestGradientAccumulatorTestUnchangeInput( ...@@ -278,8 +282,14 @@ static void TestGradientAccumulatorTestUnchangeInput(
auto var1 = create_var(use_tensor1); auto var1 = create_var(use_tensor1);
auto var_wrapper1_1 = std::make_shared<VariableWrapper>("tmp1_1"); auto var_wrapper1_1 = std::make_shared<VariableWrapper>("tmp1_1");
auto var_wrapper2_1 = std::make_shared<VariableWrapper>("tmp2_1"); auto var_wrapper2_1 = std::make_shared<VariableWrapper>("tmp2_1");
ASSERT_EQ(var_wrapper1_1->IsEmpty(), true);
CopyVar(var1, var_wrapper1_1->MutableVar()); CopyVar(var1, var_wrapper1_1->MutableVar());
ASSERT_EQ(var_wrapper1_1->IsEmpty(), false);
ASSERT_EQ(var_wrapper2_1->IsEmpty(), true);
CopyVar(var1, var_wrapper2_1->MutableVar()); CopyVar(var1, var_wrapper2_1->MutableVar());
ASSERT_EQ(var_wrapper2_1->IsEmpty(), false);
auto var2 = create_var(use_tensor2); auto var2 = create_var(use_tensor2);
auto var_wrapper1_2 = std::make_shared<VariableWrapper>("tmp1_2"); auto var_wrapper1_2 = std::make_shared<VariableWrapper>("tmp1_2");
...@@ -287,15 +297,59 @@ static void TestGradientAccumulatorTestUnchangeInput( ...@@ -287,15 +297,59 @@ static void TestGradientAccumulatorTestUnchangeInput(
CopyVar(var2, var_wrapper1_2->MutableVar()); CopyVar(var2, var_wrapper1_2->MutableVar());
CopyVar(var2, var_wrapper2_2->MutableVar()); CopyVar(var2, var_wrapper2_2->MutableVar());
g_accum1->Add(var_wrapper1_1, 0, false); // g_accum1: inner_var_ = var1 + var2
g_accum1->Add(var_wrapper1_2, 1, false); g_accum1->SumGrad(var_wrapper1_1, 0, false);
g_accum1->SumGrad(var_wrapper1_2, 1, false);
g_accum2->Add(var_wrapper2_1, 0, true); ASSERT_EQ(g_accum1->CurCnt(), g_accum1->RefCnt());
g_accum2->Add(var_wrapper2_2, 1, true); ASSERT_TRUE(g_accum1->SumGradCompleted());
// g_accum1: inner_var_ -> var_
g_accum1->AccumulateGrad();
// g_accum2: inner_var_ = var1 + var2
g_accum2->SumGrad(var_wrapper2_1, 0, true);
g_accum2->SumGrad(var_wrapper2_2, 1, true);
ASSERT_EQ(g_accum2->CurCnt(), g_accum2->RefCnt());
ASSERT_TRUE(g_accum2->SumGradCompleted());
// g_accum2: inner_var_ -> var_
g_accum2->AccumulateGrad();
ASSERT_TRUE(IsEqualVar(var_wrapper2_1->Var(), var1)); ASSERT_TRUE(IsEqualVar(var_wrapper2_1->Var(), var1));
ASSERT_TRUE(IsEqualVar(var_wrapper2_2->Var(), var2)); ASSERT_TRUE(IsEqualVar(var_wrapper2_2->Var(), var2));
ASSERT_TRUE(IsEqualVar(g_var1->Var(), g_var2->Var())); ASSERT_TRUE(IsEqualVar(g_var1->Var(), g_var2->Var()));
/** g_accum3 && g_accum4: has been initialized
* test accumulate on previous graph
*/
auto var3 = create_var(use_tensor1);
auto var_wrapper3_3 = std::make_shared<VariableWrapper>("tmp1_3");
auto var_wrapper4_3 = std::make_shared<VariableWrapper>("tmp2_3");
var_wrapper3_3->SetOverridedStopGradient(false);
var_wrapper4_3->SetOverridedStopGradient(false);
CopyVar(var3, var_wrapper3_3->MutableVar());
CopyVar(var3, var_wrapper4_3->MutableVar());
auto g_accum3 = CreateAccumulator(var_wrapper3_3, sort_gradient);
g_accum3->IncreaseRefCnt();
auto g_accum4 = CreateAccumulator(var_wrapper4_3, sort_gradient);
g_accum4->IncreaseRefCnt();
auto var4 = create_var(use_tensor2);
auto var_wrapper3_4 = std::make_shared<VariableWrapper>("tmp1_4");
auto var_wrapper4_4 = std::make_shared<VariableWrapper>("tmp2_4");
CopyVar(var4, var_wrapper3_4->MutableVar());
CopyVar(var4, var_wrapper4_4->MutableVar());
g_accum3->SumGrad(var_wrapper3_4, 0, false);
ASSERT_TRUE(g_accum3->SumGradCompleted());
// g_accum4: var_(var_wrapper3_3) + inner_var_ -> var_
g_accum3->AccumulateGrad();
g_accum4->SumGrad(var_wrapper4_4, 0, false);
ASSERT_TRUE(g_accum4->SumGradCompleted());
// g_accum4: var_(var_wrapper4_3) + inner_var_ -> var_
g_accum4->AccumulateGrad();
ASSERT_TRUE(IsEqualVar(var_wrapper3_3->Var(), var_wrapper4_3->Var()));
} }
} }
} }
......
...@@ -68,10 +68,50 @@ class VariableWrapper { ...@@ -68,10 +68,50 @@ class VariableWrapper {
} }
} }
bool IsLeaf() const {
if (OverridedStopGradient()) {
return true;
}
if (HasGradVar() && !GetGradVar()->HasGradNode()) {
return true;
}
return false;
}
bool IsLeafGrad() const {
if (!HasGradVar() && !HasGradNode() && !OverridedStopGradient()) {
return true;
}
return false;
}
void SetPersistable(bool persistable) { persistable_ = persistable; } void SetPersistable(bool persistable) { persistable_ = persistable; }
bool Persistable() const { return persistable_; } bool Persistable() const { return persistable_; }
bool IsEmpty() const {
bool is_empty = true;
if (var_.IsInitialized()) {
const framework::Tensor* tensor = nullptr;
if (var_.IsType<framework::LoDTensor>()) {
tensor = &(var_.Get<framework::LoDTensor>());
} else if (var_.IsType<framework::SelectedRows>()) {
tensor = &(var_.Get<framework::SelectedRows>().value());
} else {
PADDLE_THROW(platform::errors::PermissionDenied(
"Only support LoDTensor and SelectedRows for gradient var"));
}
if (tensor && tensor->IsInitialized()) {
is_empty = false;
}
}
return is_empty || is_empty_;
}
// TODO(zhouwei): fix the Tensor.clear_gradient() bug; then SetIsEmpty() isn't
// needed
void SetIsEmpty(bool is_empty) { is_empty_ = is_empty; }
const std::string& Name() const { return name_; } const std::string& Name() const { return name_; }
void SetName(const std::string& name) { name_ = name; } void SetName(const std::string& name) { name_ = name; }
...@@ -96,6 +136,8 @@ class VariableWrapper { ...@@ -96,6 +136,8 @@ class VariableWrapper {
bool HasGradNode() const { return !grad_node_.expired(); } bool HasGradNode() const { return !grad_node_.expired(); }
bool HasGradVar() const { return !grad_var_.expired(); }
framework::proto::VarType::Type DataType() const { framework::proto::VarType::Type DataType() const {
const framework::Tensor* tensor = nullptr; const framework::Tensor* tensor = nullptr;
if (var_.IsInitialized()) { if (var_.IsInitialized()) {
...@@ -265,6 +307,10 @@ class VariableWrapper { ...@@ -265,6 +307,10 @@ class VariableWrapper {
std::weak_ptr<VariableWrapper> grad_var_; std::weak_ptr<VariableWrapper> grad_var_;
std::weak_ptr<GradOpNode> grad_node_; std::weak_ptr<GradOpNode> grad_node_;
// TODO(zhouwei): fix bug of Tensor.clear_gradient(), function SetIsEmpty()
// isn't needed
bool is_empty_{false};
// NOTE: only grad var can hold hooks now // NOTE: only grad var can hold hooks now
// only interior var can hold interior hooks // only interior var can hold interior hooks
std::shared_ptr<InteriorVarHookPipeline> interior_hooks_; std::shared_ptr<InteriorVarHookPipeline> interior_hooks_;
......
...@@ -670,7 +670,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -670,7 +670,6 @@ void BindImperative(py::module *m_ptr) {
return TensorToPyArray(tensor, true); return TensorToPyArray(tensor, true);
}, },
R"DOC( R"DOC(
Returns a numpy array shows the value of current Tensor. Returns a numpy array shows the value of current Tensor.
Returns: Returns:
...@@ -689,7 +688,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -689,7 +688,6 @@ void BindImperative(py::module *m_ptr) {
data = paddle.to_tensor(data) data = paddle.to_tensor(data)
x = linear(data) x = linear(data)
print(x.numpy()) print(x.numpy())
)DOC") )DOC")
.def("detach", .def("detach",
[](const imperative::VarBase [](const imperative::VarBase
...@@ -1080,6 +1078,35 @@ void BindImperative(py::module *m_ptr) { ...@@ -1080,6 +1078,35 @@ void BindImperative(py::module *m_ptr) {
return std::vector<int>(); return std::vector<int>();
} }
}) })
.def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf,
R"DOC(
Whether a Tensor is a leaf Tensor.
Every Tensor whose stop_gradient is ``True`` is a leaf Tensor.
A Tensor whose stop_gradient is ``False`` is a leaf Tensor too if it is created by the user.
Returns:
bool: Whether a Tensor is a leaf Tensor.
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor(1.)
print(x.is_leaf) # True
x = paddle.to_tensor(1., stop_gradient=True)
y = x + 1
print(x.is_leaf) # True
print(y.is_leaf) # True
x = paddle.to_tensor(1., stop_gradient=False)
y = x + 1
print(x.is_leaf) # True
print(y.is_leaf) # False
)DOC")
.def_property_readonly( .def_property_readonly(
"place", [](imperative::VarBase &self) { return self.Place(); }, "place", [](imperative::VarBase &self) { return self.Place(); },
py::return_value_policy::copy) py::return_value_policy::copy)
......
...@@ -133,11 +133,12 @@ def monkey_patch_varbase(): ...@@ -133,11 +133,12 @@ def monkey_patch_varbase():
@framework.dygraph_only @framework.dygraph_only
def backward(self, retain_graph=False): def backward(self, retain_graph=False):
""" """
**Notes**:
**This API is ONLY available in Dygraph mode**
Run backward of current Graph which starts from current Tensor. Run backward of current Graph which starts from current Tensor.
The new gradient will accumulate on the previous gradient.
You can clear the gradient by calling ``Tensor.clear_grad()``.
Args: Args:
retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
...@@ -150,21 +151,20 @@ def monkey_patch_varbase(): ...@@ -150,21 +151,20 @@ def monkey_patch_varbase():
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np x = paddle.to_tensor(5., stop_gradient=False)
import paddle for i in range(5):
paddle.disable_static() y = paddle.pow(x, 4.0)
y.backward()
x = np.ones([2, 2], np.float32) print("{}: {}".format(i, x.grad))
inputs = [] # 0: [500.]
for _ in range(10): # 1: [1000.]
tmp = paddle.to_tensor(x) # 2: [1500.]
# if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since # 3: [2000.]
# there is no one need gradient on it. # 4: [2500.]
tmp.stop_gradient=False
inputs.append(tmp) x.clear_grad()
ret = paddle.add_n(inputs) print("{}".format(x.grad))
loss = paddle.sum(ret) # 0.
loss.backward()
""" """
if framework.in_dygraph_mode(): if framework.in_dygraph_mode():
...@@ -181,31 +181,21 @@ def monkey_patch_varbase(): ...@@ -181,31 +181,21 @@ def monkey_patch_varbase():
@framework.dygraph_only @framework.dygraph_only
def gradient(self): def gradient(self):
""" """
**Notes**: Get the Gradient of Current Tensor.
**This API is ONLY available in Dygraph mode**
Get the Gradient of Current Variable
Returns: Returns:
ndarray: Numpy value of the gradient of current Variable ndarray: Numpy value of the gradient of current Tensor
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
import numpy as np
x = np.ones([2, 2], np.float32) x = paddle.to_tensor(5., stop_gradient=False)
with fluid.dygraph.guard(): y = paddle.pow(x, 4.0)
inputs2 = [] y.backward()
for _ in range(10): print("grad of x: {}".format(x.grad))
tmp = fluid.dygraph.base.to_variable(x) # [500.]
tmp.stop_gradient=False
inputs2.append(tmp)
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
loss2.backward()
print(loss2.gradient())
""" """
if self._grad_ivar() is None: if self._grad_ivar() is None:
...@@ -226,6 +216,12 @@ def monkey_patch_varbase(): ...@@ -226,6 +216,12 @@ def monkey_patch_varbase():
return self.gradient() return self.gradient()
def clear_grad(self):
"""
The alias of clear_gradient().
"""
self.clear_gradient()
@property @property
def inplace_version(self): def inplace_version(self):
""" """
...@@ -284,10 +280,10 @@ def monkey_patch_varbase(): ...@@ -284,10 +280,10 @@ def monkey_patch_varbase():
for method_name, method in ( for method_name, method in (
("__bool__", __bool__), ("__nonzero__", __nonzero__), ("__bool__", __bool__), ("__nonzero__", __nonzero__),
("_to_static_var", _to_static_var), ("set_value", set_value), ("_to_static_var", _to_static_var), ("set_value", set_value),
("block", block), ("backward", backward), ("grad", grad), ("block", block), ("backward", backward), ("clear_grad", clear_grad),
("inplace_version", inplace_version), ("gradient", gradient), ("inplace_version", inplace_version), ("grad", grad),
("__str__", __str__), ("__repr__", __str__), ("__module__", "paddle"), ("gradient", gradient), ("__str__", __str__), ("__repr__", __str__),
("__name__", "Tensor")): ("__module__", "paddle"), ("__name__", "Tensor")):
setattr(core.VarBase, method_name, method) setattr(core.VarBase, method_name, method)
# patch math methods for varbase # patch math methods for varbase
......
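The newly patched clear_grad() is an alias of clear_gradient(); a short usage sketch:

import paddle

x = paddle.to_tensor(5., stop_gradient=False)
y = paddle.pow(x, 4.0)
y.backward()
print(x.grad)      # [500.]
x.clear_grad()     # alias of clear_gradient(); resets the accumulated gradient
print(x.grad)      # 0.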
...@@ -874,6 +874,8 @@ class Optimizer(object): ...@@ -874,6 +874,8 @@ class Optimizer(object):
def clear_gradients(self): def clear_gradients(self):
""" """
Clear the gradients of all optimized parameters for model. Clear the gradients of all optimized parameters for model.
Otherwise, the new gradient will accumulate on the previous gradient.
Returns: Returns:
None None
......
...@@ -478,6 +478,114 @@ class TestImperative(unittest.TestCase): ...@@ -478,6 +478,114 @@ class TestImperative(unittest.TestCase):
self.assertEqual(mlp._linear2, sublayers[1]) self.assertEqual(mlp._linear2, sublayers[1])
self.assertEqual(len(sublayers), 2) self.assertEqual(len(sublayers), 2)
def test_gradient_accumulation(self):
def test_single_api(sort_sum_gradient):
fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient})
x = paddle.to_tensor(5., stop_gradient=False)
for i in range(10):
y = paddle.pow(x, 4.0)
y.backward()
print(x.grad)
self.assertEqual(x.grad, (i + 1) * 500)
x.clear_gradient()
self.assertEqual(x.grad, 0.)
for i in range(5):
y = paddle.pow(x, 4.0)
y.backward()
print(x.grad)
self.assertEqual(x.grad, (i + 1) * 500)
def test_simple_net(sort_sum_gradient):
fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient})
x = paddle.to_tensor(5., stop_gradient=False)
y = paddle.to_tensor(2., stop_gradient=False)
z = paddle.to_tensor(3., stop_gradient=False)
def fun(x, y, z):
loss1 = x * x * y
loss2 = x * z
dx = paddle.grad([loss1], x, create_graph=True)[0]
# loss = x*x*y + x*z + 2*x*y
loss = loss1 + loss2 + dx
return loss
loss = fun(x, y, z)
loss.backward(retain_graph=True)
# x.grad = 2*x*y + z + 2*y = 27
self.assertTrue(np.array_equal(x.grad, [27]))
loss.backward(retain_graph=True)
self.assertTrue(np.array_equal(x.grad, [54]))
loss.backward()
self.assertTrue(np.array_equal(x.grad, [81]))
with self.assertRaises(RuntimeError):
loss.backward()
loss1 = x * x * y
loss2 = x * z
dx = paddle.grad([loss1], x, create_graph=True)[0]
loss = loss1 + loss2 + dx
loss.backward()
self.assertTrue(np.array_equal(dx.grad, [1]))
self.assertTrue(np.array_equal(x.grad, [108]))
def test_mlp(sort_sum_gradient):
fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient})
input_size = 5
paddle.seed(1)
mlp1 = MLP(input_size=input_size)
# generate the gradient of each step
mlp2 = MLP(input_size=input_size)
expected_weight1_grad = np.zeros(mlp2._linear1.weight.shape)
expected_bias1_grad = np.zeros(mlp2._linear1.bias.shape)
expected_weight2_grad = np.zeros(mlp2._linear2.weight.shape)
expected_bias2_grad = np.zeros(mlp2._linear2.bias.shape)
for batch_id in range(24):
x = paddle.uniform([10, input_size])
detach_x = x.detach()
clear_loss = mlp2(detach_x)
clear_loss.backward()
expected_weight1_grad = expected_weight1_grad + mlp2._linear1.weight.grad
expected_bias1_grad = expected_bias1_grad + mlp2._linear1.bias.grad
expected_weight2_grad = expected_weight2_grad + mlp2._linear2.weight.grad
expected_bias2_grad = expected_bias2_grad + mlp2._linear2.bias.grad
loss = mlp1(x)
loss.backward()
self.assertTrue(np.array_equal(loss.grad, [1]))
self.assertTrue(
np.allclose(mlp1._linear1.weight.grad,
expected_weight1_grad))
self.assertTrue(
np.allclose(mlp1._linear1.bias.grad, expected_bias1_grad))
self.assertTrue(
np.allclose(mlp1._linear2.weight.grad,
expected_weight2_grad))
self.assertTrue(
np.allclose(mlp1._linear2.bias.grad, expected_bias2_grad))
mlp2.clear_gradients()
self.assertTrue(np.array_equal(clear_loss.grad, [1]))
if ((batch_id + 1) % 8) == 0:
mlp1.clear_gradients()
expected_weight1_grad = np.zeros(mlp2._linear1.weight.shape)
expected_bias1_grad = np.zeros(mlp2._linear1.bias.shape)
expected_weight2_grad = np.zeros(mlp2._linear2.weight.shape)
expected_bias2_grad = np.zeros(mlp2._linear2.bias.shape)
with fluid.dygraph.guard():
test_single_api(False)
test_single_api(True)
test_simple_net(False)
test_simple_net(True)
test_mlp(False)
test_mlp(True)
def test_dygraph_vs_static(self): def test_dygraph_vs_static(self):
np_inp1 = np.random.rand(4, 3, 3) np_inp1 = np.random.rand(4, 3, 3)
np_inp2 = np.random.rand(4, 3, 3) np_inp2 = np.random.rand(4, 3, 3)
......
...@@ -214,7 +214,7 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -214,7 +214,7 @@ class TestDygraphDoubleGrad(TestCase):
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward() loss.backward(retain_graph=True)
x_grad_actual = x.gradient() x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * x_grad_expected = (2.0 / float(numel) *
...@@ -222,6 +222,16 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -222,6 +222,16 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2 / float(numel))).astype('float32') (x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
for i in range(5):
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
x_grad_expected = (i + 2) * (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
print(x_grad_actual)
print(x_grad_expected)
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
@dygraph_guard @dygraph_guard
def test_example_with_gradient_accumulation_and_no_grad_vars(self): def test_example_with_gradient_accumulation_and_no_grad_vars(self):
x = random_var(self.shape) x = random_var(self.shape)
......
...@@ -457,6 +457,7 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -457,6 +457,7 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
loss = paddle.mean(out) loss = paddle.mean(out)
loss.backward() loss.backward()
momentum.minimize(loss) momentum.minimize(loss)
linear.clear_gradients()
def __test_vs(self, place=fluid.CPUPlace()): def __test_vs(self, place=fluid.CPUPlace()):
paddle.disable_static(place=place) paddle.disable_static(place=place)
......
...@@ -198,6 +198,32 @@ class TestVarBase(unittest.TestCase): ...@@ -198,6 +198,32 @@ class TestVarBase(unittest.TestCase):
var = fluid.dygraph.to_variable(t) var = fluid.dygraph.to_variable(t)
self.assertTrue(np.array_equal(t, var.numpy())) self.assertTrue(np.array_equal(t, var.numpy()))
def test_leaf_tensor(self):
with fluid.dygraph.guard():
x = paddle.to_tensor(np.random.uniform(-1, 1, size=[10, 10]))
self.assertTrue(x.is_leaf)
y = x + 1
self.assertTrue(y.is_leaf)
x = paddle.to_tensor(
np.random.uniform(
-1, 1, size=[10, 10]), stop_gradient=False)
self.assertTrue(x.is_leaf)
y = x + 1
self.assertFalse(y.is_leaf)
linear = paddle.nn.Linear(10, 10)
input = paddle.to_tensor(
np.random.uniform(
-1, 1, size=[10, 10]).astype('float32'),
stop_gradient=False)
self.assertTrue(input.is_leaf)
out = linear(input)
self.assertTrue(linear.weight.is_leaf)
self.assertTrue(linear.bias.is_leaf)
self.assertFalse(out.is_leaf)
def test_detach(self): def test_detach(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
x = paddle.to_tensor(1.0, dtype="float64", stop_gradient=False) x = paddle.to_tensor(1.0, dtype="float64", stop_gradient=False)
......
...@@ -793,6 +793,8 @@ class Optimizer(object): ...@@ -793,6 +793,8 @@ class Optimizer(object):
def clear_grad(self): def clear_grad(self):
""" """
Clear the gradients of all optimized parameters for model. Clear the gradients of all optimized parameters for model.
Otherwise, the new gradient will accumulate on the previous gradient.
Returns: Returns:
None None
......
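Because gradients now accumulate across iterations, a typical dygraph training loop clears them after each update; a minimal sketch, assuming paddle.optimizer.SGD and paddle.nn.Linear, which are not part of this diff:

import paddle

linear = paddle.nn.Linear(10, 1)
sgd = paddle.optimizer.SGD(learning_rate=0.01, parameters=linear.parameters())

for _ in range(3):
    x = paddle.uniform([4, 10])
    loss = paddle.mean(linear(x))
    loss.backward()
    sgd.minimize(loss)
    # Without this, the next backward() adds onto the gradient already
    # stored in each parameter.
    sgd.clear_grad()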