From 454254115eace4d7c18cc078f6026d9d2aefe6f1 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Sat, 21 Sep 2019 12:19:11 +0800 Subject: [PATCH] Feature/auto prune in dygraph (#19757) * refactor dygraph,test=develop * fix failed unittest,test=develop * polish code,test=develop * check windows ci error,test=develop try to fix windows ci error by np.allclose,test=develop * polish vlog and profiler, test=develop * try to fix preceding ops order,test=develop * test transformer in windows ci, test=develop * use python c-api to speed up tracer.trace,test=develop * test=develop, fix docker with paddle nccl problem * test=develop, add ut for debug string and gradient_accumulator * test=develop, add tests for layer/gradient_accumulator/prepared_op * test=develop, fix complie error for test_prepared_op * test=develop, add more ut for dygraph * test=develop, create API.spec for dygraph api change * test=develop, refoctor name to make it easier to understand * test=develop, refoctor name to make it easier to understand * test=develop, fix multi-gpu failed problem , add Tracer tests, change PADDLEENFORCE to PADDLEENFORCE_EQ * test=develop, fix ut failed on parallel se-resnext * test=develop, change one more PADDLE_ENFORCE * support auto prune in dygraph mode * test=develop, support auto prune * test=develop, merge develop conflict * test=develop, fix test_layer and test_tracer ut * test=develop, fix bug which may cause stop_gradient disabled with a list of backward inputs --- paddle/fluid/imperative/engine.cc | 68 ++-- paddle/fluid/imperative/engine.h | 16 +- .../fluid/imperative/gradient_accumulator.cc | 77 ++-- paddle/fluid/imperative/layer.h | 45 ++- paddle/fluid/imperative/tests/test_layer.cc | 6 +- paddle/fluid/imperative/tests/test_tracer.cc | 2 + paddle/fluid/imperative/tracer.cc | 111 ++++-- paddle/fluid/pybind/imperative.cc | 9 +- python/paddle/fluid/framework.py | 4 +- python/paddle/fluid/layer_helper_base.py | 4 +- python/paddle/fluid/layers/nn.py | 7 +- .../unittests/test_imperative_auto_prune.py | 336 ++++++++++++++++++ .../tests/unittests/test_imperative_basic.py | 10 +- .../test_imperative_recurrent_usage.py | 2 + 14 files changed, 587 insertions(+), 110 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc index 158ba7a2a1..3a41bafbfc 100644 --- a/paddle/fluid/imperative/engine.cc +++ b/paddle/fluid/imperative/engine.cc @@ -70,23 +70,48 @@ void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) { auto& fwd_var = var->Var().Get(); auto* grad_var = var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + var->GradVarBase()->SetGradGenerated(true); auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); grad_var->Resize(fwd_var.dims()); grad_var->mutable_data(fwd_var.place(), fwd_var.type()); operators::math::set_constant(*dev_ctx, grad_var, 1.0); } -bool BasicEngine::CheckBackwardInputs(OpBase* op) { +void BasicEngine::CheckBackwardInputs(OpBase* op) { for (auto& pair : op->GetInsMap()) { for (auto& var : pair.second) { - if (var && !var->StopGradient()) { - return true; + if (var && IsGrad(var.get())) { + // if grad var has OverridedStopGradient skip this Op + if (!var->GradGenerated()) { + VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero"; + auto* dev_ctx = + 
platform::DeviceContextPool::Instance().Get(op->place()); + auto* tensor = var->MutableVar()->GetMutable(); + tensor->mutable_data(op->place(), var->DataType()); + operators::math::set_constant(*dev_ctx, tensor, 0.0); + } else { + continue; + } } } } - return false; } +void BasicEngine::SetBackwardOutputs(paddle::imperative::OpBase* op) { + for (auto& pair : op->GetOutsMap()) { + for (auto& var : pair.second) { + if (var) { + // Set Backward outputs's generate_grad as true + var->SetGradGenerated(true); + VLOG(6) << "Set backward output: " << var->Name() + << "'s SetGeneratedGrad as True"; + } + } + } +} void BasicEngine::PrepareGradAccumulators(OpBase* op) { for (const auto& pair : op->GetOutsMap()) { for (const auto& var : pair.second) { @@ -126,22 +151,19 @@ void BasicEngine::PrepareDeps() { q.pop(); VLOG(3) << "Checking grads of op " << cur_op->Type(); - if (!CheckBackwardInputs(cur_op)) { - // TODO(zjl): clear ops that do not need grad before running autograd - VLOG(3) << "Stop checking preceding ops of " << cur_op->Type() - << " because all of its backward inputs is stop_gradient=True"; - continue; - } + CheckBackwardInputs(cur_op); + + SetBackwardOutputs(cur_op); PrepareGradAccumulators(cur_op); - auto& preceding_ops = cur_op->GradPendingOps(); - for (auto* preceding_op : preceding_ops) { - PADDLE_ENFORCE_NOT_NULL(preceding_op); - ++op_deps_[preceding_op]; - if (visited.count(preceding_op) == 0) { - visited.insert(preceding_op); - q.push(preceding_op); + auto& grad_pending_ops = cur_op->GradPendingOps(); + for (auto* grad_pending_op : grad_pending_ops) { + PADDLE_ENFORCE_NOT_NULL(grad_pending_op); + ++op_deps_[grad_pending_op]; + if (visited.count(grad_pending_op) == 0) { + visited.insert(grad_pending_op); + q.push(grad_pending_op); } } } @@ -204,19 +226,19 @@ void BasicEngine::Execute() { } // Step 3: Collect ready ops - for (auto* preceding_op : cur_op->GradPendingOps()) { - PADDLE_ENFORCE_NOT_NULL(preceding_op); - auto iter = op_deps_.find(preceding_op); + for (auto* grad_pending_op : cur_op->GradPendingOps()) { + PADDLE_ENFORCE_NOT_NULL(grad_pending_op); + auto iter = op_deps_.find(grad_pending_op); if (iter == op_deps_.end()) { continue; } - VLOG(3) << "Found preceding op of " << cur_op->Type(); + VLOG(3) << "Found grad_pending op of " << cur_op->Type(); // An Op is ready to go while its deps comes to zero if (--(iter->second) == 0) { - q.push(preceding_op); - VLOG(3) << "Push preceding op " << preceding_op->Type() + q.push(grad_pending_op); + VLOG(3) << "Push grad_pending op " << grad_pending_op->Type() << " into queue"; } } diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h index efc005c7f3..a268004552 100644 --- a/paddle/fluid/imperative/engine.h +++ b/paddle/fluid/imperative/engine.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/imperative/backward_strategy.h" @@ -49,11 +50,20 @@ class Engine { void InsertOp(OpBase* op, std::shared_ptr op_shared) { grad_ops_[op] = std::move(op_shared); } - void Clear() { grad_ops_.clear(); } + + void InsertGradVar(VarBase* grad) { grad_vars_.emplace(grad); } + + bool IsGrad(VarBase* var) { return grad_vars_.count(var) > 0; } + + void Clear() { + grad_ops_.clear(); + grad_vars_.clear(); + } private: std::unordered_map> grad_ops_; // opBase for remove - grad_op + std::unordered_set grad_vars_; }; class BasicEngine : public Engine { @@ -69,7 +79,9 @@ class BasicEngine : public Engine { private: void PrepareDeps(); - bool CheckBackwardInputs(OpBase* op); + void 
CheckBackwardInputs(OpBase* op); + + void SetBackwardOutputs(OpBase* op); void PrepareGradAccumulators(OpBase* op); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 7f424dcfbc..509415a367 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -105,10 +105,23 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { void EagerGradientAccumulator::Add(std::shared_ptr var, size_t trace_id) { auto* dst_var = var_->MutableVar(); - if (cur_cnt_ == 0) { - *dst_var = std::move(*(var->MutableVar())); + auto place = var->Var().Get().place(); + if (!var_->OverridedStopGradient()) { + VLOG(3) << "Sum Gradient for: " << var_->Name(); + if (cur_cnt_ == 0) { + *dst_var = std::move(*(var->MutableVar())); + } else { + TensorAdd(var->Var(), dst_var); + } } else { - TensorAdd(var->Var(), dst_var); + if (!var_->Var().IsInitialized() || + !var_->Var().Get().IsInitialized()) { + VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero"; + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* tensor = var_->MutableVar()->GetMutable(); + tensor->mutable_data(place, var->DataType()); + operators::math::set_constant(*dev_ctx, tensor, 0.0); + } } ++cur_cnt_; } @@ -116,30 +129,44 @@ void EagerGradientAccumulator::Add(std::shared_ptr var, void SortedGradientAccumulator::Add(std::shared_ptr var, size_t trace_id) { auto* dst_var = var_->MutableVar(); - if (ref_cnt_ == 1) { - *dst_var = std::move(*(var->MutableVar())); - } else { - if (tmp_grad_vars_.empty()) { - tmp_grad_vars_.reserve(ref_cnt_); - } - - tmp_grad_vars_.emplace_back(std::move(var), trace_id); - - if (tmp_grad_vars_.size() != ref_cnt_) { - return; + auto place = var->Var().Get().place(); + if (!var_->OverridedStopGradient()) { + if (ref_cnt_ == 1) { + *dst_var = std::move(*(var->MutableVar())); + } else { + if (tmp_grad_vars_.empty()) { + tmp_grad_vars_.reserve(ref_cnt_); + } + + tmp_grad_vars_.emplace_back(std::move(var), trace_id); + + if (tmp_grad_vars_.size() != ref_cnt_) { + return; + } + + std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(), + [](const std::pair, size_t>& p1, + const std::pair, size_t>& p2) { + return p1.second > p2.second; + }); + + *dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar())); + for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) { + TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var); + } + + tmp_grad_vars_.clear(); } - - std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(), - [](const std::pair, size_t>& p1, - const std::pair, size_t>& p2) { - return p1.second > p2.second; - }); - - *dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar())); - for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) { - TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var); + } else { + if (!var_->Var().IsInitialized() || + !var_->Var().Get().IsInitialized()) { + VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero"; + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* tensor = var_->MutableVar()->GetMutable(); + tensor->mutable_data(place, var->DataType()); + operators::math::set_constant(*dev_ctx, tensor, 0.0); } - + // looks like tmp_grad_vars will not have any member but just in case tmp_grad_vars_.clear(); } } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 37741fef24..4ef22c97d0 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -93,14 
+93,44 @@ class VarBase { return &(grad_var_->var_); } - void SetStopGradient(bool stop_gradient) { - stop_gradient_ = stop_gradient; + // This is used for python api + void SetOverridedStopGradient(bool stop_gradient) { + if (stop_gradient) { + overrided_stop_gradient_ = 1; + } else { + overrided_stop_gradient_ = 0; + } if (grad_var_) { - grad_var_->stop_gradient_ = stop_gradient; + grad_var_->SetOverridedStopGradient(stop_gradient); + } + } + // This is used for python api + bool OverridedStopGradient() const { + if (overrided_stop_gradient_ == 0) { + return false; + } else { + return true; } } - bool StopGradient() const { return stop_gradient_; } + // This is used inside C++ + int InnerOverridedStopGradient() const { return overrided_stop_gradient_; } + + bool GradGenerated() const { return grad_generated_; } + + void SetGradGenerated(bool generated) { grad_generated_ = generated; } + // This is used inside C++ + void InnerSetOverridedStopGradient(bool stop_gradient) { + if (overrided_stop_gradient_ == -1) { + overrided_stop_gradient_ = static_cast(stop_gradient); + if (grad_var_) { + grad_var_->InnerSetOverridedStopGradient(stop_gradient); + } + } else { + VLOG(6) << "Ignore Stop gradient conversion for Var: " << Name() + << "Set value is: " << overrided_stop_gradient_; + } + } void SetPersistable(bool persistable) { persistable_ = persistable; } @@ -156,8 +186,11 @@ class VarBase { // grad_op indicates which grad_op will this var be used as input std::vector> grad_ops_; - - bool stop_gradient_{false}; + // add this property for users may set stop_gradient themselves and this + // should override the + // frameworks setting (-1) unset, (1) true, (0) false + int overrided_stop_gradient_{-1}; + bool grad_generated_{false}; bool persistable_{false}; framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR}; diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index a25ec66903..c92d0fd67c 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -139,10 +139,10 @@ TEST(test_layer, test_varbase_basic) { vin_with_grad->MutableGradVar()) != 0)); ASSERT_TRUE( dynamic_cast(vin_with_grad->MutableGradVar()) != 0); - vin_with_grad->SetStopGradient(true); - ASSERT_TRUE(vin_with_grad->StopGradient()); + vin_with_grad->SetOverridedStopGradient(false); + ASSERT_FALSE(vin_with_grad->OverridedStopGradient()); ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true)); - ASSERT_TRUE(vin_with_grad->StopGradient()); + ASSERT_FALSE(vin_with_grad->OverridedStopGradient()); ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name")); ASSERT_EQ(vin_with_grad->Name(), "new_name"); } diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 1bd0e8bc9d..f112b9fc1e 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -81,6 +81,7 @@ TEST(test_tracer, test_track_backward_output) { new imperative::VarBase(true, "x_in")); std::shared_ptr y_in( new imperative::VarBase(false, "y_in")); + x_in->SetOverridedStopGradient(false); std::shared_ptr vout( new imperative::VarBase(true, "vout")); platform::CPUPlace place; @@ -119,6 +120,7 @@ TEST(test_tracer, test_track_backward_input) { std::shared_ptr vout( new imperative::VarBase(false, "vout")); platform::CPUPlace place; + x_in->SetOverridedStopGradient(false); std::vector src_data(10, 2.0); std::vector dims1 = {2, 5}; std::vector dims2 = {5, 
2}; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index ff40f2e8d8..0fff6b8cda 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -32,6 +32,16 @@ static std::vector> CreateGradOpDescs( } } +static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { + for (const auto& name_pair : outs) { + for (const auto& vb : name_pair.second) { + VLOG(6) << "Set output: " << vb->Name() << "'s OverridedStopGradient as " + << generate_grad; + vb->InnerSetOverridedStopGradient(generate_grad); + } + } +} + void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, const platform::Place& place, bool trace_backward) { @@ -45,16 +55,27 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, TraceBackward(op, framework::OpDesc(op->Type(), op->InputNameMap(), op->OutputNameMap(), op->Attrs()), ins, outs); - VLOG(6) << "Finish tracking Backward of op: " << type; + } else { + VLOG(3) << "No Grad to track for Op: " << type; } - VLOG(6) << "Finish tracing fwd op: " << type; } bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs, bool trace_backward) { - // TODO(jiabin): Implement auto prune here - return trace_backward; + if (!trace_backward) return false; + + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + VLOG(6) << "Find out input: " << var_base->Name() + << "'s GeneratedGrad is True"; + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; + } + } + } + return false; } void Tracer::TraceBackward(const std::shared_ptr& fwd_op, @@ -133,14 +154,25 @@ void Tracer::TraceBackward(const std::shared_ptr& fwd_op, PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, "Cannot find forward variable named %s", fwd_var_name); + const auto& tmp = (*(fwd_var_iter->second))->GradVarBase(); PADDLE_ENFORCE_NOT_NULL( - (*(fwd_var_iter->second))->GradVarBase(), + tmp.get(), "Grad of %s should " "not be NULL when we Track_Backward Input of %s", (*(fwd_var_iter->second))->Name(), grad_op->Type()); - (*(fwd_var_iter->second))->GradVarBase()->AddGradOps(grad_op); + // Create grad_in's dim in tensor for Grad Dependency compute + auto* tensor = tmp->MutableVar()->GetMutable(); + tensor->Resize((*(fwd_var_iter->second)) + ->Var() + .Get() + .dims()); + // Add Grad Op for grad_in + tmp->AddGradOps(grad_op); VLOG(3) << "Add Grad Op " << grad_op->Type() << " for :" << (*(fwd_var_iter->second))->GradVarBase()->Name(); + // Add Grad var input to engine set + engine_->InsertGradVar(tmp.get()); + VLOG(3) << "Add Grad: " << tmp->Name() << " in to Engine"; bwd_in.emplace_back((*(fwd_var_iter->second))->GradVarBase()); } else { // If it is a forward var, just add it @@ -150,8 +182,7 @@ void Tracer::TraceBackward(const std::shared_ptr& fwd_op, grad_in_var_name); bwd_in.emplace_back(*(fwd_var_iter->second)); } - - VLOG(3) << "Set backward input " << grad_ins.first << " of " + VLOG(3) << "Set backward input from fwd var" << grad_ins.first << " of " << grad_op->Type() << " to be " << (bwd_in.back() ? 
bwd_in.back()->Name() : "nullptr"); } @@ -173,40 +204,44 @@ void Tracer::TraceBackward(const std::shared_ptr& fwd_op, PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, "Cannot find forward variable named %s", iter->second); - PADDLE_ENFORCE_NOT_NULL( - (*(fwd_var_iter->second))->GradVarBase(), - "Grad of %s should " - "not be NULL when we Track_Backward Output of %s", - (*(fwd_var_iter->second))->Name(), grad_op->Type()); - bwd_out.emplace_back((*(fwd_var_iter->second))->GradVarBase()); - VLOG(3) << "Set backward output " << grad_outs.first << " of " - << grad_op->Type() << " to be " - << (bwd_out.back() ? bwd_out.back()->Name() : "nullptr"); - - auto preceding_ops = - (*(fwd_var_iter->second))->GradVarBase()->GradOps(); - - if (VLOG_IS_ON(3) && !preceding_ops.empty()) { - VLOG(3) << "Add preceding Op of :" - << (*(fwd_var_iter->second))->GradVarBase()->Name() - << " It's preceding Op are: "; - for (const auto& op : preceding_ops) { - VLOG(3) << op->Type(); + const auto& tmp = (*(fwd_var_iter->second))->GradVarBase(); + + PADDLE_ENFORCE_NOT_NULL(tmp.get(), + "Grad output: %s of op: %s should not be NULL", + (tmp->Name(), grad_op->Type())); + + if ((!tmp->OverridedStopGradient()) || (grad_outs.second.size() > 1)) { + VLOG(3) << "Set backward output " << grad_outs.first << " of " + << grad_op->Type() << " to be " << tmp->Name() + << ". Its Overrided Stop_Gradient is: False"; + bwd_out.emplace_back(tmp); + auto grad_pending_ops = + (*(fwd_var_iter->second))->GradVarBase()->GradOps(); + if (VLOG_IS_ON(3) && !grad_pending_ops.empty()) { + VLOG(3) << "Add grad_pending Op of :" + << (*(fwd_var_iter->second))->GradVarBase()->Name() + << " It's grad_pending Op are: "; + for (const auto& op : grad_pending_ops) { + VLOG(3) << op->Type(); + } } - } - - if (!preceding_ops.empty()) { - for (const auto& op : preceding_ops) { - PADDLE_ENFORCE_NOT_NULL(op, "No nullptr should be preceding_op"); - if (visited_preceding_ops.count(op) == 0) { - visited_preceding_ops.insert(op); - grad_op->InsertGradPendingOps(op); + if (!grad_pending_ops.empty()) { + for (const auto& op : grad_pending_ops) { + PADDLE_ENFORCE_NOT_NULL(op, + "No nullptr should be grad_pending op"); + if (visited_preceding_ops.count(op) == 0) { + visited_preceding_ops.insert(op); + grad_op->InsertGradPendingOps(op); + } } + } else { + VLOG(5) << "Hit leaf VarBase" + << (*(fwd_var_iter->second))->GradVarBase()->Name(); } } else { - VLOG(5) << "Hit leaf VarBase"; - VLOG(5) << "Hit leaf VarBase" - << (*(fwd_var_iter->second))->GradVarBase()->Name(); + VLOG(3) << "Skip backward output " << grad_outs.first << " of " + << grad_op->Type() << " Named: " << tmp->Name() + << ", since its Overrided Stop_Gradient is: True"; } } } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 82867faff8..63e3e7e857 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -230,13 +230,11 @@ void BindImperative(py::module *m_ptr) { [](imperative::VarBase &self, const std::string &name, framework::proto::VarType::Type type, framework::proto::VarType::Type dtype, - const std::vector &dims, bool stop_gradient, - bool persistable) { + const std::vector &dims, bool persistable) { new (&self) imperative::VarBase(name); self.SetPersistable(persistable); self.SetType(type); self.SetDataType(dtype); - self.SetStopGradient(stop_gradient); if (type == framework::proto::VarType::LOD_TENSOR) { auto *tensor = self.MutableVar()->GetMutable(); @@ -302,8 +300,9 @@ void BindImperative(py::module *m_ptr) { 
.def_property_readonly("dtype", &imperative::VarBase::DataType) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property("stop_gradient", &imperative::VarBase::StopGradient, - &imperative::VarBase::SetStopGradient); + .def_property("stop_gradient", + &imperative::VarBase::OverridedStopGradient, + &imperative::VarBase::SetOverridedStopGradient); py::class_ layer(m, "Layer"); layer.def(py::init<>()) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db3231f6b5..f938c09b1f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -456,12 +456,13 @@ class Variable(object): if in_dygraph_mode(): # record vars in tracer rather than blocks self._ivar = kwargs.get("ivar", None) + self.stop_gradient_ = kwargs.get("stop_gradient", True) if not self._ivar: self._ivar = core.VarBase( name, type if type else core.VarDesc.VarType.LOD_TENSOR, dtype if dtype else core.VarDesc.VarType.FP32, - list(shape) if shape else [], stop_gradient, True + list(shape) if shape else [], True if persistable else False) if persistable: _dygraph_tracer().trace_var(name, self) @@ -1847,6 +1848,7 @@ class Block(object): pass else: initializer(param, self) + param.stop_gradient = False return param def append_op(self, *args, **kwargs): diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 5e4eac6b5c..5fa7fef381 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -266,7 +266,8 @@ class LayerHelperBase(object): shape, dtype, is_bias=False, - default_initializer=None): + default_initializer=None, + stop_gradient=False): """Create parameters for this layers. Args: @@ -320,6 +321,7 @@ class LayerHelperBase(object): return self.main_program.global_block().create_parameter( dtype=dtype, shape=shape, + stop_gradient=stop_gradient, **attr._to_kwargs(with_initializer=True)) else: self.startup_program.global_block().create_parameter( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2e12a9e9c5..606acbfbee 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6980,8 +6980,8 @@ def one_hot(input, depth, allow_out_of_range=False): type="one_hot", inputs=inputs, attrs=attrs, - outputs={'Out': one_hot_out}, - stop_gradient=True) + outputs={'Out': one_hot_out}) + one_hot_out.stop_gradient = True return one_hot_out @@ -7019,8 +7019,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}, - stop_gradient=True) + attrs={'step': float(step)}) counter.stop_gradient = True return counter diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py new file mode 100644 index 0000000000..ac849e1cfb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -0,0 +1,336 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid +import numpy as np + + +class AutoPruneLayer0(fluid.Layer): + def __init__(self, name_scope): + super(AutoPruneLayer0, self).__init__(name_scope) + self.fc1 = fluid.dygraph.FC( + "FC_1", + 5, + param_attr=fluid.initializer.ConstantInitializer(value=2), + bias_attr=False) + self.fc2 = fluid.dygraph.FC( + "FC_2", + 5, + param_attr=fluid.initializer.ConstantInitializer(value=2), + bias_attr=False) + + def forward(self, x, y): + a = self.fc1(x) + b = self.fc2(y) + c = fluid.layers.mul(a, b) + d = fluid.layers.reduce_mean(c) + return d + + +class AutoPruneLayer1(fluid.Layer): + def __init__(self, name_scope): + super(AutoPruneLayer1, self).__init__(name_scope) + self.fc1 = fluid.dygraph.FC( + "FC_1", + 5, + param_attr=fluid.initializer.ConstantInitializer(value=2), + bias_attr=False) + self.fc2 = fluid.dygraph.FC( + "FC_2", + 5, + param_attr=fluid.initializer.ConstantInitializer(value=2), + bias_attr=False) + + def forward(self, x, y): + a = self.fc1(x) + b = self.fc2(y) + b.stop_gradient = True + c = fluid.layers.mul(a, b) + d = fluid.layers.reduce_mean(c) + return d + + +class AutoPruneLayer2(fluid.Layer): + def __init__(self, name_scope): + super(AutoPruneLayer2, self).__init__(name_scope) + self.fc = fluid.dygraph.FC("FC1", size=10, act=None) + self.fc2 = fluid.dygraph.FC("FC2", size=1, act=None) + + def forward(self, x, label): + feature = self.fc(x) + label = self.fc2(label) + label = fluid.layers.cast(label, dtype="float32") + label = fluid.layers.cast(label, dtype='int64') + # Note that the label is not persistable in fluid.layers.cross_entropy. + loss = fluid.layers.cross_entropy(input=feature, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class AutoPruneLayer3(fluid.Layer): + def __init__(self, name_scope): + super(AutoPruneLayer3, self).__init__(name_scope) + self.fc = fluid.dygraph.FC("FC1", size=20, act=None) + + def forward(self, x, label, test_num): + feature = self.fc(x) + part1, part2 = fluid.layers.split( + feature, num_or_sections=[10, 10], dim=1) + # Note that: part2 is not used. 
+ loss = fluid.layers.cross_entropy(input=part1, label=label) + loss = fluid.layers.mean(loss) + if test_num == 1: + return loss, part2 + else: + return loss, part1, part2 + + +class MyLayer(fluid.Layer): + def __init__(self, name_scope, vocab_size, size, dtype="float32"): + super(MyLayer, self).__init__(name_scope, dtype) + self.embed0 = fluid.Embedding(self.full_name(), size=(vocab_size, size)) + self.embed1 = fluid.Embedding(self.full_name(), size=(vocab_size, size)) + self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype) + self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype) + + def forward(self, x): + # this method involves only the fc layers + loss = fluid.layers.reduce_mean(self.fc0(x) + self.fc1(x)) + return loss + + def linear0(self, x): + loss = fluid.layers.reduce_mean(self.fc0(x)) + return loss + + def embed_linear0(self, x): + loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x))) + return loss + + +class MyLayer2(fluid.Layer): + def __init__(self, name_scope, vocab_size, size, dtype="float32"): + super(MyLayer2, self).__init__(name_scope, dtype) + self.embed0 = fluid.Embedding(self.full_name(), size=(vocab_size, size)) + self.embed1 = fluid.Embedding(self.full_name(), size=(vocab_size, size)) + self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype) + self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype) + + def forward(self, indices): + # mind the difference with MyLayer + # In this example, the forward method involes all params + loss = fluid.layers.reduce_mean( + self.fc0(self.embed0(indices)) + self.fc1(self.embed1(indices))) + return loss + + def linear0(self, x): + loss = fluid.layers.reduce_mean(self.fc0(x)) + return loss + + def embed_linear0(self, x): + loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x))) + return loss + + +class TestImperativeAutoPrune(unittest.TestCase): + def test_auto_prune(self): + with fluid.dygraph.guard(): + case1 = AutoPruneLayer0("l1") + value1 = np.arange(25).reshape(5, 5).astype("float32") + value2 = np.arange(25).reshape(5, 5).astype("float32") + v1 = fluid.dygraph.to_variable(value1) + v2 = fluid.dygraph.to_variable(value2) + loss = case1(v1, v2) + loss.backward() + self.assertTrue(case1.fc2._w._ivar._grad_ivar() is not None) + self.assertTrue(case1.fc1._w._ivar._grad_ivar() is not None) + + def test_auto_prune2(self): + with fluid.dygraph.guard(): + case2 = AutoPruneLayer1("l1") + value1 = np.arange(25).reshape(5, 5).astype("float32") + value2 = np.arange(25).reshape(5, 5).astype("float32") + v1 = fluid.dygraph.to_variable(value1) + v2 = fluid.dygraph.to_variable(value2) + loss = case2(v1, v2) + loss.backward() + self.assertTrue(case2.fc2._w._ivar._grad_ivar() is None) + self.assertTrue(case2.fc1._w._ivar._grad_ivar() is not None) + + def test_auto_prune3(self): + with fluid.dygraph.guard(): + case3 = AutoPruneLayer3("l3") + value1 = np.arange(784).reshape(1, 784).astype("float32") + value2 = np.arange(1).reshape(1, 1).astype("int64") + v1 = fluid.dygraph.to_variable(value1) + v2 = fluid.dygraph.to_variable(value2) + loss, part2 = case3(v1, v2, 1) + loss.backward() + self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None) + self.assertTrue((part2.gradient() == 0).all()) + + def test_auto_prune4(self): + with fluid.dygraph.guard(): + case4 = AutoPruneLayer3("l3") + value1 = np.arange(784).reshape(1, 784).astype("float32") + value2 = np.arange(1).reshape(1, 1).astype("int64") + v1 = fluid.dygraph.to_variable(value1) + v2 = fluid.dygraph.to_variable(value2) + loss, part2 = case4(v1, v2, 1) + 
part2.backward() + self.assertTrue(case4.fc._w._ivar._grad_ivar() is not None) + self.assertTrue((part2.gradient() == 1).all()) + + def test_auto_prune5(self): + with fluid.dygraph.guard(): + case4 = AutoPruneLayer3("l3") + value1 = np.arange(784).reshape(1, 784).astype("float32") + value2 = np.arange(1).reshape(1, 1).astype("int64") + v1 = fluid.dygraph.to_variable(value1) + v2 = fluid.dygraph.to_variable(value2) + loss, part1, part2 = case4(v1, v2, 2) + part1.backward() + self.assertTrue(case4.fc._w._ivar._grad_ivar() is not None) + self.assertTrue((part2.gradient() == 0).all()) + + def test_auto_prune6(self): + with fluid.dygraph.guard(): + value0 = np.arange(26).reshape(2, 13).astype("float32") + value1 = np.arange(6).reshape(2, 3).astype("float32") + value2 = np.arange(10).reshape(2, 5).astype("float32") + fc = fluid.FC("fc1", size=5, dtype="float32") + fc2 = fluid.FC("fc2", size=3, dtype="float32") + a = fluid.dygraph.to_variable(value0) + b = fluid.dygraph.to_variable(value1) + c = fluid.dygraph.to_variable(value2) + out1 = fc(a) + out2 = fc2(b) + out1.stop_gradient = True + out = fluid.layers.concat(input=[out1, out2, c], axis=1) + out.backward() + self.assertTrue((fc._w.gradient() == 0).all()) + self.assertTrue((out1.gradient() == 0).all()) + + def test_auto_prune7(self): + with fluid.dygraph.guard(): + value0 = np.arange(26).reshape(2, 13).astype("float32") + value1 = np.arange(6).reshape(2, 3).astype("float32") + value2 = np.arange(10).reshape(2, 5).astype("float32") + fc = fluid.FC("fc1", size=5, dtype="float32") + fc2 = fluid.FC("fc2", size=3, dtype="float32") + a = fluid.dygraph.to_variable(value0) + b = fluid.dygraph.to_variable(value1) + c = fluid.dygraph.to_variable(value2) + out1 = fc(a) + out2 = fc2(b) + out1.stop_gradient = True + out = fluid.layers.concat(input=[out1, out2, c], axis=1) + backward_strategy = fluid.dygraph.BackwardStrategy() + out.backward(backward_strategy) + self.assertTrue((fc._w.gradient() == 0).all()) + self.assertTrue((out1.gradient() == 0).all()) + + def test_auto_prune_with_optimizer(self): + vocab_size = 100 + size = 20 + batch_size = 16 + + indices = np.random.randint( + low=0, high=100, size=(batch_size, 1)).astype("int64") + embed = np.random.randn(batch_size, size).astype("float32") + + place = fluid.CPUPlace() + with fluid.dygraph.guard(place): + model = MyLayer("mylayer", vocab_size, size) + optimizer = fluid.optimizer.AdamOptimizer(0.001) + grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) + + indices = fluid.dygraph.to_variable(indices) + emebd = fluid.dygraph.to_variable(embed) + dummy_loss = model(embed) + + loss = model.embed_linear0(indices) + loss.backward() + _, params_grads = optimizer.minimize(loss, grad_clip=grad_clip) + for items in params_grads: + assert items[0].name is not model.embed1._w.name + assert items[0].name is not model.fc1._w.name + assert model.embed1._w._ivar._grad_ivar() is None + assert model.fc1._w._ivar._grad_ivar() is None + + with fluid.dygraph.guard(place): + model = MyLayer2("mylayer", vocab_size, size) + optimizer = fluid.optimizer.AdamOptimizer(0.001) + grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) + + indices = fluid.dygraph.to_variable(indices) + emebd = fluid.dygraph.to_variable(embed) + dummy_loss = model(indices) + + loss = model.embed_linear0(indices) + loss.backward() + optimizer.minimize(loss, grad_clip=grad_clip) + for items in params_grads: + assert items[0].name is not model.embed1._w.name + assert items[0].name is not model.fc1._w.name + assert 
model.embed1._w._ivar._grad_ivar() is None + assert model.fc1._w._ivar._grad_ivar() is None + + def test_case2_prune_no_grad_branch(self): + with fluid.dygraph.guard(): + value1 = np.arange(784).reshape(1, 784) + value2 = np.arange(1).reshape(1, 1) + v1 = fluid.dygraph.to_variable(value1).astype("float32") + v2 = fluid.dygraph.to_variable(value2).astype("float32") + case3 = AutoPruneLayer2("l2") + loss = case3(v1, v2) + loss.backward() + self.assertTrue(case3.fc2._w._ivar._grad_ivar() is None) + self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None) + + def test_case2_prune_no_grad_branch(self): + with fluid.dygraph.guard(): + value1 = np.arange(784).reshape(1, 784) + value2 = np.arange(1).reshape(1, 1) + v1 = fluid.dygraph.to_variable(value1).astype("float32") + v2 = fluid.dygraph.to_variable(value2).astype("float32") + case3 = AutoPruneLayer2("l2") + loss = case3(v1, v2) + loss.backward() + self.assertTrue(case3.fc2._w._ivar._grad_ivar() is None) + self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None) + + def test_case3_prune_no_grad_branch2(self): + with fluid.dygraph.guard(): + value1 = np.arange(1).reshape(1, 1) + fc = fluid.dygraph.FC("FC1", size=1, act=None) + label = fluid.dygraph.to_variable(value1).astype("float32") + label = fc(label) + label = fluid.layers.cast(label, dtype="float32") + label = fluid.layers.cast(label, dtype='int64') + out = fluid.layers.one_hot(input=label, depth=100) + loss = fluid.layers.mean(out) + loss.backward() + self.assertTrue(fc._w._ivar._grad_ivar() is None) + + def test_case4_with_no_grad_op_maker(self): + with fluid.dygraph.guard(): + out = fluid.layers.gaussian_random(shape=[20, 30]) + loss = fluid.layers.mean(out) + loss.backward() + self.assertTrue(out._ivar._grad_ivar() is None) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index dd721f6671..acfc1e75c0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -183,14 +183,18 @@ class TestImperative(unittest.TestCase): with fluid.dygraph.guard(): inputs = [] for _ in range(10): - inputs.append(fluid.dygraph.base.to_variable(x)) + tmp = fluid.dygraph.base.to_variable(x) + tmp.stop_gradient = False + inputs.append(tmp) ret = fluid.layers.sums(inputs) loss = fluid.layers.reduce_sum(ret) loss.backward() with fluid.dygraph.guard(): inputs2 = [] for _ in range(10): - inputs2.append(fluid.dygraph.base.to_variable(x)) + tmp = fluid.dygraph.base.to_variable(x) + tmp.stop_gradient = False + inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) backward_strategy = fluid.dygraph.BackwardStrategy() @@ -214,6 +218,7 @@ class TestImperative(unittest.TestCase): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.dygraph.guard(): var_inp = fluid.dygraph.base.to_variable(np_inp) + var_inp.stop_gradient = False l = MyLayer("my_layer") x = l(var_inp)[0] self.assertIsNotNone(x) @@ -223,6 +228,7 @@ class TestImperative(unittest.TestCase): with fluid.dygraph.guard(): var_inp2 = fluid.dygraph.base.to_variable(np_inp) + var_inp2.stop_gradient = False l2 = MyLayer("my_layer") x2 = l2(var_inp2)[0] self.assertIsNotNone(x2) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py index 08c0fc8f00..268e24fa5e 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py @@ -47,6 +47,8 @@ class TestRecurrentFeed(unittest.TestCase): fluid.default_main_program().random_seed = seed original_in1 = to_variable(original_np1) original_in2 = to_variable(original_np2) + original_in1.stop_gradient = False + original_in2.stop_gradient = False rt = RecurrentTest("RecurrentTest") for i in range(3): -- GitLab
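The user-visible effect of the new auto pruning is easiest to see from the added unit tests. The snippet below is only an illustration — a condensed restatement of test_auto_prune2 / AutoPruneLayer1 from test_imperative_auto_prune.py above, assuming the 1.x fluid.dygraph API that this patch targets:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        fc1 = fluid.dygraph.FC("FC_1", 5, bias_attr=False)
        fc2 = fluid.dygraph.FC("FC_2", 5, bias_attr=False)
        x = fluid.dygraph.to_variable(np.ones((5, 5), dtype="float32"))
        y = fluid.dygraph.to_variable(np.ones((5, 5), dtype="float32"))

        a = fc1(x)
        b = fc2(y)
        b.stop_gradient = True  # cut the graph at b
        loss = fluid.layers.reduce_mean(fluid.layers.mul(a, b))
        loss.backward()

        # The tracer records no grad ops for outputs whose inputs all have
        # stop_gradient=True, so the fc2 branch is pruned and its weight
        # never receives a gradient, while fc1's weight still does.
        assert fc2._w._ivar._grad_ivar() is None
        assert fc1._w._ivar._grad_ivar() is not None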