Unverified · Commit 45425411 authored by Jiabin Yang, committed by GitHub

Feature/auto prune in dygraph (#19757)

* refactor dygraph,test=develop

* fix failed unittest,test=develop

* polish code,test=develop

* check windows ci error,test=develop
try to fix windows ci error by np.allclose,test=develop

* polish vlog and profiler, test=develop

* try to fix preceding ops order,test=develop

* test transformer in windows ci, test=develop

* use python c-api to speed up tracer.trace,test=develop

* test=develop, fix docker with paddle nccl problem

* test=develop, add ut for debug string and gradient_accumulator

* test=develop, add tests for layer/gradient_accumulator/prepared_op

* test=develop, fix compile error for test_prepared_op

* test=develop, add more ut for dygraph

* test=develop, create API.spec for dygraph api change

* test=develop, refactor name to make it easier to understand

* test=develop, refactor name to make it easier to understand

* test=develop, fix multi-gpu failure, add Tracer tests, change PADDLE_ENFORCE to PADDLE_ENFORCE_EQ

* test=develop, fix ut failed on parallel se-resnext

* test=develop, change one more PADDLE_ENFORCE

* support auto prune in dygraph mode

* test=develop, support auto prune

* test=develop, merge develop conflict

* test=develop, fix test_layer and test_tracer ut

* test=develop, fix bug which may cause stop_gradient disabled with a list of backward inputs
Parent 418a0967
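For context, a minimal user-facing sketch of what "auto prune" means here, modeled on the new test_imperative_auto_prune.py added in this PR (only APIs that appear in those tests are used): marking a variable with stop_gradient = True prunes its whole backward branch, so parameters that feed only that branch receive no gradient.

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    fc1 = fluid.dygraph.FC("FC_1", 5, bias_attr=False)
    fc2 = fluid.dygraph.FC("FC_2", 5, bias_attr=False)
    x = fluid.dygraph.to_variable(np.ones([5, 5], dtype="float32"))
    y = fluid.dygraph.to_variable(np.ones([5, 5], dtype="float32"))
    a = fc1(x)
    b = fc2(y)
    b.stop_gradient = True  # prune the fc2 branch from backward
    loss = fluid.layers.reduce_mean(fluid.layers.mul(a, b))
    loss.backward()
    # fc1's weight received a gradient, fc2's weight was auto-pruned
    assert fc1._w._ivar._grad_ivar() is not None
    assert fc2._w._ivar._grad_ivar() is None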
......@@ -70,23 +70,48 @@ void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) {
auto& fwd_var = var->Var().Get<framework::LoDTensor>();
auto* grad_var =
var->GradVarBase()->MutableVar()->GetMutable<framework::LoDTensor>();
VLOG(6) << "init loss grad:" << var->GradVarBase()->Name()
<< " as stop_gradient false";
var->GradVarBase()->InnerSetOverridedStopGradient(false);
var->GradVarBase()->SetGradGenerated(true);
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place());
grad_var->Resize(fwd_var.dims());
grad_var->mutable_data(fwd_var.place(), fwd_var.type());
operators::math::set_constant(*dev_ctx, grad_var, 1.0);
}
bool BasicEngine::CheckBackwardInputs(OpBase* op) {
void BasicEngine::CheckBackwardInputs(OpBase* op) {
for (auto& pair : op->GetInsMap()) {
for (auto& var : pair.second) {
if (var && !var->StopGradient()) {
return true;
if (var && IsGrad(var.get())) {
// if this grad var's grad has not been generated yet, fill it with zeros
if (!var->GradGenerated()) {
VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero";
auto* dev_ctx =
platform::DeviceContextPool::Instance().Get(op->place());
auto* tensor = var->MutableVar()->GetMutable<framework::LoDTensor>();
tensor->mutable_data(op->place(), var->DataType());
operators::math::set_constant(*dev_ctx, tensor, 0.0);
} else {
continue;
}
}
}
}
return false;
}
void BasicEngine::SetBackwardOutputs(paddle::imperative::OpBase* op) {
for (auto& pair : op->GetOutsMap()) {
for (auto& var : pair.second) {
if (var) {
// Mark each backward output's grad as generated
var->SetGradGenerated(true);
VLOG(6) << "Set backward output: " << var->Name()
<< "'s SetGeneratedGrad as True";
}
}
}
}
void BasicEngine::PrepareGradAccumulators(OpBase* op) {
for (const auto& pair : op->GetOutsMap()) {
for (const auto& var : pair.second) {
......@@ -126,22 +151,19 @@ void BasicEngine::PrepareDeps() {
q.pop();
VLOG(3) << "Checking grads of op " << cur_op->Type();
if (!CheckBackwardInputs(cur_op)) {
// TODO(zjl): clear ops that do not need grad before running autograd
VLOG(3) << "Stop checking preceding ops of " << cur_op->Type()
<< " because all of its backward inputs is stop_gradient=True";
continue;
}
CheckBackwardInputs(cur_op);
SetBackwardOutputs(cur_op);
PrepareGradAccumulators(cur_op);
auto& preceding_ops = cur_op->GradPendingOps();
for (auto* preceding_op : preceding_ops) {
PADDLE_ENFORCE_NOT_NULL(preceding_op);
++op_deps_[preceding_op];
if (visited.count(preceding_op) == 0) {
visited.insert(preceding_op);
q.push(preceding_op);
auto& grad_pending_ops = cur_op->GradPendingOps();
for (auto* grad_pending_op : grad_pending_ops) {
PADDLE_ENFORCE_NOT_NULL(grad_pending_op);
++op_deps_[grad_pending_op];
if (visited.count(grad_pending_op) == 0) {
visited.insert(grad_pending_op);
q.push(grad_pending_op);
}
}
}
......@@ -204,19 +226,19 @@ void BasicEngine::Execute() {
}
// Step 3: Collect ready ops
for (auto* preceding_op : cur_op->GradPendingOps()) {
PADDLE_ENFORCE_NOT_NULL(preceding_op);
auto iter = op_deps_.find(preceding_op);
for (auto* grad_pending_op : cur_op->GradPendingOps()) {
PADDLE_ENFORCE_NOT_NULL(grad_pending_op);
auto iter = op_deps_.find(grad_pending_op);
if (iter == op_deps_.end()) {
continue;
}
VLOG(3) << "Found preceding op of " << cur_op->Type();
VLOG(3) << "Found grad_pending op of " << cur_op->Type();
// An op is ready to run once its dependency count drops to zero
if (--(iter->second) == 0) {
q.push(preceding_op);
VLOG(3) << "Push preceding op " << preceding_op->Type()
q.push(grad_pending_op);
VLOG(3) << "Push grad_pending op " << grad_pending_op->Type()
<< " into queue";
}
}
......
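The two hunks above schedule backward execution by reference counting: PrepareDeps counts, for every reachable grad op, how many predecessors still have to run, and Execute pushes a grad_pending op into the ready queue once that count drops to zero. A minimal Python sketch of the traversal (illustrative only, not the C++ implementation; run_backward and grad_pending_ops are made-up names):

from collections import deque

def run_backward(start_ops, grad_pending_ops):
    """Illustrative model of BasicEngine's scheduling.

    grad_pending_ops maps a grad op to the grad ops that may only run
    after it has produced their input gradients.
    """
    # Pass 1 (PrepareDeps): count how many predecessors feed each op.
    op_deps = {}
    visited = set(start_ops)
    q = deque(start_ops)
    while q:
        cur = q.popleft()
        for nxt in grad_pending_ops.get(cur, []):
            op_deps[nxt] = op_deps.get(nxt, 0) + 1
            if nxt not in visited:
                visited.add(nxt)
                q.append(nxt)

    # Pass 2 (Execute): run ops whose dependency count has dropped to zero.
    order = []
    q = deque(start_ops)
    while q:
        cur = q.popleft()
        order.append(cur)  # stands in for running the grad op
        for nxt in grad_pending_ops.get(cur, []):
            if nxt not in op_deps:
                continue
            op_deps[nxt] -= 1
            if op_deps[nxt] == 0:
                q.append(nxt)
    return order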
......@@ -18,6 +18,7 @@
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/backward_strategy.h"
......@@ -49,11 +50,20 @@ class Engine {
void InsertOp(OpBase* op, std::shared_ptr<OpBase> op_shared) {
grad_ops_[op] = std::move(op_shared);
}
void Clear() { grad_ops_.clear(); }
void InsertGradVar(VarBase* grad) { grad_vars_.emplace(grad); }
bool IsGrad(VarBase* var) { return grad_vars_.count(var) > 0; }
void Clear() {
grad_ops_.clear();
grad_vars_.clear();
}
private:
std::unordered_map<OpBase*, std::shared_ptr<OpBase>>
grad_ops_; // opBase for remove - grad_op
std::unordered_set<VarBase*> grad_vars_;
};
class BasicEngine : public Engine {
......@@ -69,7 +79,9 @@ class BasicEngine : public Engine {
private:
void PrepareDeps();
bool CheckBackwardInputs(OpBase* op);
void CheckBackwardInputs(OpBase* op);
void SetBackwardOutputs(OpBase* op);
void PrepareGradAccumulators(OpBase* op);
......
......@@ -105,10 +105,23 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (cur_cnt_ == 0) {
*dst_var = std::move(*(var->MutableVar()));
auto place = var->Var().Get<framework::LoDTensor>().place();
if (!var_->OverridedStopGradient()) {
VLOG(3) << "Sum Gradient for: " << var_->Name();
if (cur_cnt_ == 0) {
*dst_var = std::move(*(var->MutableVar()));
} else {
TensorAdd(var->Var(), dst_var);
}
} else {
TensorAdd(var->Var(), dst_var);
if (!var_->Var().IsInitialized() ||
!var_->Var().Get<framework::LoDTensor>().IsInitialized()) {
VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero";
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
tensor->mutable_data(place, var->DataType());
operators::math::set_constant(*dev_ctx, tensor, 0.0);
}
}
++cur_cnt_;
}
......@@ -116,30 +129,44 @@ void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (ref_cnt_ == 1) {
*dst_var = std::move(*(var->MutableVar()));
} else {
if (tmp_grad_vars_.empty()) {
tmp_grad_vars_.reserve(ref_cnt_);
}
tmp_grad_vars_.emplace_back(std::move(var), trace_id);
if (tmp_grad_vars_.size() != ref_cnt_) {
return;
auto place = var->Var().Get<framework::LoDTensor>().place();
if (!var_->OverridedStopGradient()) {
if (ref_cnt_ == 1) {
*dst_var = std::move(*(var->MutableVar()));
} else {
if (tmp_grad_vars_.empty()) {
tmp_grad_vars_.reserve(ref_cnt_);
}
tmp_grad_vars_.emplace_back(std::move(var), trace_id);
if (tmp_grad_vars_.size() != ref_cnt_) {
return;
}
std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(),
[](const std::pair<std::shared_ptr<VarBase>, size_t>& p1,
const std::pair<std::shared_ptr<VarBase>, size_t>& p2) {
return p1.second > p2.second;
});
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) {
TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var);
}
tmp_grad_vars_.clear();
}
std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(),
[](const std::pair<std::shared_ptr<VarBase>, size_t>& p1,
const std::pair<std::shared_ptr<VarBase>, size_t>& p2) {
return p1.second > p2.second;
});
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) {
TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var);
} else {
if (!var_->Var().IsInitialized() ||
!var_->Var().Get<framework::LoDTensor>().IsInitialized()) {
VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero";
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
tensor->mutable_data(place, var->DataType());
operators::math::set_constant(*dev_ctx, tensor, 0.0);
}
// tmp_grad_vars_ should already be empty here, but clear it just in case
tmp_grad_vars_.clear();
}
}
......
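Both accumulators now branch on the owning variable's OverridedStopGradient flag: real summation only happens for vars that require grad, while stop-gradient vars whose tensor is still uninitialized are filled with zeros of the matching dtype and place, which matches the new tests asserting gradient() == 0 on pruned branches. A rough Python model of that decision (accumulate_grad is a made-up name; tensors are plain lists here):

def accumulate_grad(owner_requires_grad, grads):
    # grads: incoming gradient "tensors" for one variable, as python lists
    if owner_requires_grad:
        total = list(grads[0])
        for g in grads[1:]:
            total = [a + b for a, b in zip(total, g)]
        return total
    # stop-gradient owner: just a zero tensor with the incoming shape
    return [0.0] * len(grads[0])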
......@@ -93,14 +93,44 @@ class VarBase {
return &(grad_var_->var_);
}
void SetStopGradient(bool stop_gradient) {
stop_gradient_ = stop_gradient;
// This is used for python api
void SetOverridedStopGradient(bool stop_gradient) {
if (stop_gradient) {
overrided_stop_gradient_ = 1;
} else {
overrided_stop_gradient_ = 0;
}
if (grad_var_) {
grad_var_->stop_gradient_ = stop_gradient;
grad_var_->SetOverridedStopGradient(stop_gradient);
}
}
// This is used for python api
bool OverridedStopGradient() const {
if (overrided_stop_gradient_ == 0) {
return false;
} else {
return true;
}
}
bool StopGradient() const { return stop_gradient_; }
// This is used inside C++
int InnerOverridedStopGradient() const { return overrided_stop_gradient_; }
bool GradGenerated() const { return grad_generated_; }
void SetGradGenerated(bool generated) { grad_generated_ = generated; }
// This is used inside C++
void InnerSetOverridedStopGradient(bool stop_gradient) {
if (overrided_stop_gradient_ == -1) {
overrided_stop_gradient_ = static_cast<int>(stop_gradient);
if (grad_var_) {
grad_var_->InnerSetOverridedStopGradient(stop_gradient);
}
} else {
VLOG(6) << "Ignore Stop gradient conversion for Var: " << Name()
<< "Set value is: " << overrided_stop_gradient_;
}
}
void SetPersistable(bool persistable) { persistable_ = persistable; }
......@@ -156,8 +186,11 @@ class VarBase {
// grad_op indicates which grad_op will this var be used as input
std::vector<std::weak_ptr<OpBase>> grad_ops_;
bool stop_gradient_{false};
// users may set stop_gradient themselves, and that setting should override
// the framework's default: (-1) unset, (1) true, (0) false
int overrided_stop_gradient_{-1};
bool grad_generated_{false};
bool persistable_{false};
framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
......
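The new overrided_stop_gradient_ field is a tri-state flag rather than a plain bool: -1 means nothing has decided yet, 0/1 are fixed answers, and InnerSetOverridedStopGradient (used by the tracer) only writes while the flag is still unset, so an explicit user setting always wins. A toy Python model of just that rule (VarBaseModel and its method names are illustrative, not Paddle API):

class VarBaseModel(object):
    """Toy model of VarBase's tri-state stop_gradient flag.

    -1: unset, 1: stop gradient, 0: do not stop gradient.
    """

    def __init__(self):
        self._overrided_stop_gradient = -1  # unset by default

    def set_overrided_stop_gradient(self, stop_gradient):
        # python-api setter: always wins
        self._overrided_stop_gradient = 1 if stop_gradient else 0

    def inner_set_overrided_stop_gradient(self, stop_gradient):
        # framework-internal setter: only takes effect while still unset
        if self._overrided_stop_gradient == -1:
            self._overrided_stop_gradient = int(stop_gradient)
        # otherwise the earlier (user) setting is kept

    def overrided_stop_gradient(self):
        # mirrors OverridedStopGradient(): only an explicit 0 means "needs grad"
        return self._overrided_stop_gradient != 0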
......@@ -139,10 +139,10 @@ TEST(test_layer, test_varbase_basic) {
vin_with_grad->MutableGradVar()) != 0));
ASSERT_TRUE(
dynamic_cast<framework::Variable*>(vin_with_grad->MutableGradVar()) != 0);
vin_with_grad->SetStopGradient(true);
ASSERT_TRUE(vin_with_grad->StopGradient());
vin_with_grad->SetOverridedStopGradient(false);
ASSERT_FALSE(vin_with_grad->OverridedStopGradient());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true));
ASSERT_TRUE(vin_with_grad->StopGradient());
ASSERT_FALSE(vin_with_grad->OverridedStopGradient());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name"));
ASSERT_EQ(vin_with_grad->Name(), "new_name");
}
......
......@@ -81,6 +81,7 @@ TEST(test_tracer, test_track_backward_output) {
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(false, "y_in"));
x_in->SetOverridedStopGradient(false);
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(true, "vout"));
platform::CPUPlace place;
......@@ -119,6 +120,7 @@ TEST(test_tracer, test_track_backward_input) {
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
platform::CPUPlace place;
x_in->SetOverridedStopGradient(false);
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
......
......@@ -32,6 +32,16 @@ static std::vector<std::unique_ptr<framework::OpDesc>> CreateGradOpDescs(
}
}
static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) {
for (const auto& name_pair : outs) {
for (const auto& vb : name_pair.second) {
VLOG(6) << "Set output: " << vb->Name() << "'s OverridedStopGradient as "
<< generate_grad;
vb->InnerSetOverridedStopGradient(generate_grad);
}
}
}
void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place, bool trace_backward) {
......@@ -45,16 +55,27 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
TraceBackward(op, framework::OpDesc(op->Type(), op->InputNameMap(),
op->OutputNameMap(), op->Attrs()),
ins, outs);
VLOG(6) << "Finish tracking Backward of op: " << type;
} else {
VLOG(3) << "No Grad to track for Op: " << type;
}
VLOG(6) << "Finish tracing fwd op: " << type;
}
bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins,
const NameVarBaseMap& outs,
bool trace_backward) {
// TODO(jiabin): Implement auto prune here
return trace_backward;
if (!trace_backward) return false;
for (const auto& name_pair : ins) {
for (const auto& var_base : name_pair.second) {
if (!var_base->OverridedStopGradient()) {
VLOG(6) << "Find out input: " << var_base->Name()
<< "'s GeneratedGrad is True";
PassStopGradient(outs, var_base->OverridedStopGradient());
return true;
}
}
}
return false;
}
void Tracer::TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
......@@ -133,14 +154,25 @@ void Tracer::TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
"Cannot find forward variable named %s",
fwd_var_name);
const auto& tmp = (*(fwd_var_iter->second))->GradVarBase();
PADDLE_ENFORCE_NOT_NULL(
(*(fwd_var_iter->second))->GradVarBase(),
tmp.get(),
"Grad of %s should "
"not be NULL when we Track_Backward Input of %s",
(*(fwd_var_iter->second))->Name(), grad_op->Type());
(*(fwd_var_iter->second))->GradVarBase()->AddGradOps(grad_op);
// Resize grad_in's tensor to the forward var's dims for grad dependency computation
auto* tensor = tmp->MutableVar()->GetMutable<framework::LoDTensor>();
tensor->Resize((*(fwd_var_iter->second))
->Var()
.Get<framework::LoDTensor>()
.dims());
// Add Grad Op for grad_in
tmp->AddGradOps(grad_op);
VLOG(3) << "Add Grad Op " << grad_op->Type() << " for :"
<< (*(fwd_var_iter->second))->GradVarBase()->Name();
// Add Grad var input to engine set
engine_->InsertGradVar(tmp.get());
VLOG(3) << "Add Grad: " << tmp->Name() << " in to Engine";
bwd_in.emplace_back((*(fwd_var_iter->second))->GradVarBase());
} else {
// If it is a forward var, just add it
......@@ -150,8 +182,7 @@ void Tracer::TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
grad_in_var_name);
bwd_in.emplace_back(*(fwd_var_iter->second));
}
VLOG(3) << "Set backward input " << grad_ins.first << " of "
VLOG(3) << "Set backward input from fwd var" << grad_ins.first << " of "
<< grad_op->Type() << " to be "
<< (bwd_in.back() ? bwd_in.back()->Name() : "nullptr");
}
......@@ -173,40 +204,44 @@ void Tracer::TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
"Cannot find forward variable named %s",
iter->second);
PADDLE_ENFORCE_NOT_NULL(
(*(fwd_var_iter->second))->GradVarBase(),
"Grad of %s should "
"not be NULL when we Track_Backward Output of %s",
(*(fwd_var_iter->second))->Name(), grad_op->Type());
bwd_out.emplace_back((*(fwd_var_iter->second))->GradVarBase());
VLOG(3) << "Set backward output " << grad_outs.first << " of "
<< grad_op->Type() << " to be "
<< (bwd_out.back() ? bwd_out.back()->Name() : "nullptr");
auto preceding_ops =
(*(fwd_var_iter->second))->GradVarBase()->GradOps();
if (VLOG_IS_ON(3) && !preceding_ops.empty()) {
VLOG(3) << "Add preceding Op of :"
<< (*(fwd_var_iter->second))->GradVarBase()->Name()
<< " It's preceding Op are: ";
for (const auto& op : preceding_ops) {
VLOG(3) << op->Type();
const auto& tmp = (*(fwd_var_iter->second))->GradVarBase();
PADDLE_ENFORCE_NOT_NULL(tmp.get(),
"Grad output: %s of op: %s should not be NULL",
(tmp->Name(), grad_op->Type()));
if ((!tmp->OverridedStopGradient()) || (grad_outs.second.size() > 1)) {
VLOG(3) << "Set backward output " << grad_outs.first << " of "
<< grad_op->Type() << " to be " << tmp->Name()
<< ". Its Overrided Stop_Gradient is: False";
bwd_out.emplace_back(tmp);
auto grad_pending_ops =
(*(fwd_var_iter->second))->GradVarBase()->GradOps();
if (VLOG_IS_ON(3) && !grad_pending_ops.empty()) {
VLOG(3) << "Add grad_pending Op of :"
<< (*(fwd_var_iter->second))->GradVarBase()->Name()
<< " It's grad_pending Op are: ";
for (const auto& op : grad_pending_ops) {
VLOG(3) << op->Type();
}
}
}
if (!preceding_ops.empty()) {
for (const auto& op : preceding_ops) {
PADDLE_ENFORCE_NOT_NULL(op, "No nullptr should be preceding_op");
if (visited_preceding_ops.count(op) == 0) {
visited_preceding_ops.insert(op);
grad_op->InsertGradPendingOps(op);
if (!grad_pending_ops.empty()) {
for (const auto& op : grad_pending_ops) {
PADDLE_ENFORCE_NOT_NULL(op,
"No nullptr should be grad_pending op");
if (visited_preceding_ops.count(op) == 0) {
visited_preceding_ops.insert(op);
grad_op->InsertGradPendingOps(op);
}
}
} else {
VLOG(5) << "Hit leaf VarBase"
<< (*(fwd_var_iter->second))->GradVarBase()->Name();
}
} else {
VLOG(5) << "Hit leaf VarBase";
VLOG(5) << "Hit leaf VarBase"
<< (*(fwd_var_iter->second))->GradVarBase()->Name();
VLOG(3) << "Skip backward output " << grad_outs.first << " of "
<< grad_op->Type() << " Named: " << tmp->Name()
<< ", since its Overrided Stop_Gradient is: True";
}
}
}
......
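ComputeRequiredGrad no longer simply returns trace_backward: an op gets a grad op only if at least one forward input still requires gradient, and in that case PassStopGradient propagates stop_gradient = False to the op's outputs so the chain keeps flowing. A hedged Python restatement (compute_required_grad and the dict/attribute shapes are illustrative, not Paddle API):

def compute_required_grad(ins, outs, trace_backward):
    """Illustrative restatement of Tracer::ComputeRequiredGrad.

    ins/outs map slot names to lists of variables, each with a boolean
    stop_gradient attribute (standing in for OverridedStopGradient).
    """
    if not trace_backward:
        return False
    for var_list in ins.values():
        for var in var_list:
            if not var.stop_gradient:
                # at least one input needs grad: the op gets a grad op and
                # its outputs inherit stop_gradient = False (PassStopGradient)
                for out_list in outs.values():
                    for out in out_list:
                        out.stop_gradient = False
                return True
    return False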
......@@ -230,13 +230,11 @@ void BindImperative(py::module *m_ptr) {
[](imperative::VarBase &self, const std::string &name,
framework::proto::VarType::Type type,
framework::proto::VarType::Type dtype,
const std::vector<int> &dims, bool stop_gradient,
bool persistable) {
const std::vector<int> &dims, bool persistable) {
new (&self) imperative::VarBase(name);
self.SetPersistable(persistable);
self.SetType(type);
self.SetDataType(dtype);
self.SetStopGradient(stop_gradient);
if (type == framework::proto::VarType::LOD_TENSOR) {
auto *tensor =
self.MutableVar()->GetMutable<framework::LoDTensor>();
......@@ -302,8 +300,9 @@ void BindImperative(py::module *m_ptr) {
.def_property_readonly("dtype", &imperative::VarBase::DataType)
.def_property("persistable", &imperative::VarBase::Persistable,
&imperative::VarBase::SetPersistable)
.def_property("stop_gradient", &imperative::VarBase::StopGradient,
&imperative::VarBase::SetStopGradient);
.def_property("stop_gradient",
&imperative::VarBase::OverridedStopGradient,
&imperative::VarBase::SetOverridedStopGradient);
py::class_<imperative::Layer, Layer /* <--- trampoline*/> layer(m, "Layer");
layer.def(py::init<>())
......
......@@ -456,12 +456,13 @@ class Variable(object):
if in_dygraph_mode():
# record vars in tracer rather than blocks
self._ivar = kwargs.get("ivar", None)
self.stop_gradient_ = kwargs.get("stop_gradient", True)
if not self._ivar:
self._ivar = core.VarBase(
name, type
if type else core.VarDesc.VarType.LOD_TENSOR, dtype
if dtype else core.VarDesc.VarType.FP32,
list(shape) if shape else [], stop_gradient, True
list(shape) if shape else [], True
if persistable else False)
if persistable:
_dygraph_tracer().trace_var(name, self)
......@@ -1847,6 +1848,7 @@ class Block(object):
pass
else:
initializer(param, self)
param.stop_gradient = False
return param
def append_op(self, *args, **kwargs):
......
......@@ -266,7 +266,8 @@ class LayerHelperBase(object):
shape,
dtype,
is_bias=False,
default_initializer=None):
default_initializer=None,
stop_gradient=False):
"""Create parameters for this layers.
Args:
......@@ -320,6 +321,7 @@ class LayerHelperBase(object):
return self.main_program.global_block().create_parameter(
dtype=dtype,
shape=shape,
stop_gradient=stop_gradient,
**attr._to_kwargs(with_initializer=True))
else:
self.startup_program.global_block().create_parameter(
......
......@@ -6980,8 +6980,8 @@ def one_hot(input, depth, allow_out_of_range=False):
type="one_hot",
inputs=inputs,
attrs=attrs,
outputs={'Out': one_hot_out},
stop_gradient=True)
outputs={'Out': one_hot_out})
one_hot_out.stop_gradient = True
return one_hot_out
......@@ -7019,8 +7019,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
type='increment',
inputs={'X': [counter]},
outputs={'Out': [counter]},
attrs={'step': float(step)},
stop_gradient=True)
attrs={'step': float(step)})
counter.stop_gradient = True
return counter
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import numpy as np
class AutoPruneLayer0(fluid.Layer):
def __init__(self, name_scope):
super(AutoPruneLayer0, self).__init__(name_scope)
self.fc1 = fluid.dygraph.FC(
"FC_1",
5,
param_attr=fluid.initializer.ConstantInitializer(value=2),
bias_attr=False)
self.fc2 = fluid.dygraph.FC(
"FC_2",
5,
param_attr=fluid.initializer.ConstantInitializer(value=2),
bias_attr=False)
def forward(self, x, y):
a = self.fc1(x)
b = self.fc2(y)
c = fluid.layers.mul(a, b)
d = fluid.layers.reduce_mean(c)
return d
class AutoPruneLayer1(fluid.Layer):
def __init__(self, name_scope):
super(AutoPruneLayer1, self).__init__(name_scope)
self.fc1 = fluid.dygraph.FC(
"FC_1",
5,
param_attr=fluid.initializer.ConstantInitializer(value=2),
bias_attr=False)
self.fc2 = fluid.dygraph.FC(
"FC_2",
5,
param_attr=fluid.initializer.ConstantInitializer(value=2),
bias_attr=False)
def forward(self, x, y):
a = self.fc1(x)
b = self.fc2(y)
b.stop_gradient = True
c = fluid.layers.mul(a, b)
d = fluid.layers.reduce_mean(c)
return d
class AutoPruneLayer2(fluid.Layer):
def __init__(self, name_scope):
super(AutoPruneLayer2, self).__init__(name_scope)
self.fc = fluid.dygraph.FC("FC1", size=10, act=None)
self.fc2 = fluid.dygraph.FC("FC2", size=1, act=None)
def forward(self, x, label):
feature = self.fc(x)
label = self.fc2(label)
label = fluid.layers.cast(label, dtype="float32")
label = fluid.layers.cast(label, dtype='int64')
# Note that the label is not persistable in fluid.layers.cross_entropy.
loss = fluid.layers.cross_entropy(input=feature, label=label)
loss = fluid.layers.mean(loss)
return loss
class AutoPruneLayer3(fluid.Layer):
def __init__(self, name_scope):
super(AutoPruneLayer3, self).__init__(name_scope)
self.fc = fluid.dygraph.FC("FC1", size=20, act=None)
def forward(self, x, label, test_num):
feature = self.fc(x)
part1, part2 = fluid.layers.split(
feature, num_or_sections=[10, 10], dim=1)
# Note that: part2 is not used.
loss = fluid.layers.cross_entropy(input=part1, label=label)
loss = fluid.layers.mean(loss)
if test_num == 1:
return loss, part2
else:
return loss, part1, part2
class MyLayer(fluid.Layer):
def __init__(self, name_scope, vocab_size, size, dtype="float32"):
super(MyLayer, self).__init__(name_scope, dtype)
self.embed0 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
self.embed1 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)
def forward(self, x):
# this method involves only the fc layers
loss = fluid.layers.reduce_mean(self.fc0(x) + self.fc1(x))
return loss
def linear0(self, x):
loss = fluid.layers.reduce_mean(self.fc0(x))
return loss
def embed_linear0(self, x):
loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x)))
return loss
class MyLayer2(fluid.Layer):
def __init__(self, name_scope, vocab_size, size, dtype="float32"):
super(MyLayer2, self).__init__(name_scope, dtype)
self.embed0 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
self.embed1 = fluid.Embedding(self.full_name(), size=(vocab_size, size))
self.fc0 = fluid.FC(self.full_name(), size=size, dtype=dtype)
self.fc1 = fluid.FC(self.full_name(), size=size, dtype=dtype)
def forward(self, indices):
# mind the difference with MyLayer
# In this example, the forward method involves all params
loss = fluid.layers.reduce_mean(
self.fc0(self.embed0(indices)) + self.fc1(self.embed1(indices)))
return loss
def linear0(self, x):
loss = fluid.layers.reduce_mean(self.fc0(x))
return loss
def embed_linear0(self, x):
loss = fluid.layers.reduce_mean(self.fc0(self.embed0(x)))
return loss
class TestImperativeAutoPrune(unittest.TestCase):
def test_auto_prune(self):
with fluid.dygraph.guard():
case1 = AutoPruneLayer0("l1")
value1 = np.arange(25).reshape(5, 5).astype("float32")
value2 = np.arange(25).reshape(5, 5).astype("float32")
v1 = fluid.dygraph.to_variable(value1)
v2 = fluid.dygraph.to_variable(value2)
loss = case1(v1, v2)
loss.backward()
self.assertTrue(case1.fc2._w._ivar._grad_ivar() is not None)
self.assertTrue(case1.fc1._w._ivar._grad_ivar() is not None)
def test_auto_prune2(self):
with fluid.dygraph.guard():
case2 = AutoPruneLayer1("l1")
value1 = np.arange(25).reshape(5, 5).astype("float32")
value2 = np.arange(25).reshape(5, 5).astype("float32")
v1 = fluid.dygraph.to_variable(value1)
v2 = fluid.dygraph.to_variable(value2)
loss = case2(v1, v2)
loss.backward()
self.assertTrue(case2.fc2._w._ivar._grad_ivar() is None)
self.assertTrue(case2.fc1._w._ivar._grad_ivar() is not None)
def test_auto_prune3(self):
with fluid.dygraph.guard():
case3 = AutoPruneLayer3("l3")
value1 = np.arange(784).reshape(1, 784).astype("float32")
value2 = np.arange(1).reshape(1, 1).astype("int64")
v1 = fluid.dygraph.to_variable(value1)
v2 = fluid.dygraph.to_variable(value2)
loss, part2 = case3(v1, v2, 1)
loss.backward()
self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 0).all())
def test_auto_prune4(self):
with fluid.dygraph.guard():
case4 = AutoPruneLayer3("l3")
value1 = np.arange(784).reshape(1, 784).astype("float32")
value2 = np.arange(1).reshape(1, 1).astype("int64")
v1 = fluid.dygraph.to_variable(value1)
v2 = fluid.dygraph.to_variable(value2)
loss, part2 = case4(v1, v2, 1)
part2.backward()
self.assertTrue(case4.fc._w._ivar._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 1).all())
def test_auto_prune5(self):
with fluid.dygraph.guard():
case4 = AutoPruneLayer3("l3")
value1 = np.arange(784).reshape(1, 784).astype("float32")
value2 = np.arange(1).reshape(1, 1).astype("int64")
v1 = fluid.dygraph.to_variable(value1)
v2 = fluid.dygraph.to_variable(value2)
loss, part1, part2 = case4(v1, v2, 2)
part1.backward()
self.assertTrue(case4.fc._w._ivar._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 0).all())
def test_auto_prune6(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
value1 = np.arange(6).reshape(2, 3).astype("float32")
value2 = np.arange(10).reshape(2, 5).astype("float32")
fc = fluid.FC("fc1", size=5, dtype="float32")
fc2 = fluid.FC("fc2", size=3, dtype="float32")
a = fluid.dygraph.to_variable(value0)
b = fluid.dygraph.to_variable(value1)
c = fluid.dygraph.to_variable(value2)
out1 = fc(a)
out2 = fc2(b)
out1.stop_gradient = True
out = fluid.layers.concat(input=[out1, out2, c], axis=1)
out.backward()
self.assertTrue((fc._w.gradient() == 0).all())
self.assertTrue((out1.gradient() == 0).all())
def test_auto_prune7(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
value1 = np.arange(6).reshape(2, 3).astype("float32")
value2 = np.arange(10).reshape(2, 5).astype("float32")
fc = fluid.FC("fc1", size=5, dtype="float32")
fc2 = fluid.FC("fc2", size=3, dtype="float32")
a = fluid.dygraph.to_variable(value0)
b = fluid.dygraph.to_variable(value1)
c = fluid.dygraph.to_variable(value2)
out1 = fc(a)
out2 = fc2(b)
out1.stop_gradient = True
out = fluid.layers.concat(input=[out1, out2, c], axis=1)
backward_strategy = fluid.dygraph.BackwardStrategy()
out.backward(backward_strategy)
self.assertTrue((fc._w.gradient() == 0).all())
self.assertTrue((out1.gradient() == 0).all())
def test_auto_prune_with_optimizer(self):
vocab_size = 100
size = 20
batch_size = 16
indices = np.random.randint(
low=0, high=100, size=(batch_size, 1)).astype("int64")
embed = np.random.randn(batch_size, size).astype("float32")
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = MyLayer("mylayer", vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(0.001)
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
emebd = fluid.dygraph.to_variable(embed)
dummy_loss = model(embed)
loss = model.embed_linear0(indices)
loss.backward()
_, params_grads = optimizer.minimize(loss, grad_clip=grad_clip)
for items in params_grads:
assert items[0].name is not model.embed1._w.name
assert items[0].name is not model.fc1._w.name
assert model.embed1._w._ivar._grad_ivar() is None
assert model.fc1._w._ivar._grad_ivar() is None
with fluid.dygraph.guard(place):
model = MyLayer2("mylayer", vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(0.001)
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
emebd = fluid.dygraph.to_variable(embed)
dummy_loss = model(indices)
loss = model.embed_linear0(indices)
loss.backward()
optimizer.minimize(loss, grad_clip=grad_clip)
for items in params_grads:
assert items[0].name is not model.embed1._w.name
assert items[0].name is not model.fc1._w.name
assert model.embed1._w._ivar._grad_ivar() is None
assert model.fc1._w._ivar._grad_ivar() is None
def test_case2_prune_no_grad_branch(self):
with fluid.dygraph.guard():
value1 = np.arange(784).reshape(1, 784)
value2 = np.arange(1).reshape(1, 1)
v1 = fluid.dygraph.to_variable(value1).astype("float32")
v2 = fluid.dygraph.to_variable(value2).astype("float32")
case3 = AutoPruneLayer2("l2")
loss = case3(v1, v2)
loss.backward()
self.assertTrue(case3.fc2._w._ivar._grad_ivar() is None)
self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None)
def test_case2_prune_no_grad_branch(self):
with fluid.dygraph.guard():
value1 = np.arange(784).reshape(1, 784)
value2 = np.arange(1).reshape(1, 1)
v1 = fluid.dygraph.to_variable(value1).astype("float32")
v2 = fluid.dygraph.to_variable(value2).astype("float32")
case3 = AutoPruneLayer2("l2")
loss = case3(v1, v2)
loss.backward()
self.assertTrue(case3.fc2._w._ivar._grad_ivar() is None)
self.assertTrue(case3.fc._w._ivar._grad_ivar() is not None)
def test_case3_prune_no_grad_branch2(self):
with fluid.dygraph.guard():
value1 = np.arange(1).reshape(1, 1)
fc = fluid.dygraph.FC("FC1", size=1, act=None)
label = fluid.dygraph.to_variable(value1).astype("float32")
label = fc(label)
label = fluid.layers.cast(label, dtype="float32")
label = fluid.layers.cast(label, dtype='int64')
out = fluid.layers.one_hot(input=label, depth=100)
loss = fluid.layers.mean(out)
loss.backward()
self.assertTrue(fc._w._ivar._grad_ivar() is None)
def test_case4_with_no_grad_op_maker(self):
with fluid.dygraph.guard():
out = fluid.layers.gaussian_random(shape=[20, 30])
loss = fluid.layers.mean(out)
loss.backward()
self.assertTrue(out._ivar._grad_ivar() is None)
if __name__ == '__main__':
unittest.main()
......@@ -183,14 +183,18 @@ class TestImperative(unittest.TestCase):
with fluid.dygraph.guard():
inputs = []
for _ in range(10):
inputs.append(fluid.dygraph.base.to_variable(x))
tmp = fluid.dygraph.base.to_variable(x)
tmp.stop_gradient = False
inputs.append(tmp)
ret = fluid.layers.sums(inputs)
loss = fluid.layers.reduce_sum(ret)
loss.backward()
with fluid.dygraph.guard():
inputs2 = []
for _ in range(10):
inputs2.append(fluid.dygraph.base.to_variable(x))
tmp = fluid.dygraph.base.to_variable(x)
tmp.stop_gradient = False
inputs2.append(tmp)
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
backward_strategy = fluid.dygraph.BackwardStrategy()
......@@ -214,6 +218,7 @@ class TestImperative(unittest.TestCase):
np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
with fluid.dygraph.guard():
var_inp = fluid.dygraph.base.to_variable(np_inp)
var_inp.stop_gradient = False
l = MyLayer("my_layer")
x = l(var_inp)[0]
self.assertIsNotNone(x)
......@@ -223,6 +228,7 @@ class TestImperative(unittest.TestCase):
with fluid.dygraph.guard():
var_inp2 = fluid.dygraph.base.to_variable(np_inp)
var_inp2.stop_gradient = False
l2 = MyLayer("my_layer")
x2 = l2(var_inp2)[0]
self.assertIsNotNone(x2)
......
......@@ -47,6 +47,8 @@ class TestRecurrentFeed(unittest.TestCase):
fluid.default_main_program().random_seed = seed
original_in1 = to_variable(original_np1)
original_in2 = to_variable(original_np2)
original_in1.stop_gradient = False
original_in2.stop_gradient = False
rt = RecurrentTest("RecurrentTest")
for i in range(3):
......