diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index f537e4b9e569dd4c513ac0efde7240833bcf04b6..f4bb2f3e2fc2c8cf0376631d1996b395a8bc581a 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -163,6 +163,20 @@ std::vector<OpDesc *> BlockDesc::AllOps() const {
   return res;
 }
 
+void BlockDesc::Clear() {
+  // clear all ops
+  ops_.clear();
+
+  // clear all vars which are not persistable
+  for (auto it = vars_.begin(); it != vars_.end();) {
+    if (it->second->Persistable()) {
+      ++it;
+    } else {
+      vars_.erase(it++);
+    }
+  }
+}
+
 void BlockDesc::Flush() {
   for (auto &op_desc : ops_) {
     op_desc->Flush();
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 960ca39e1eadd3c064beb0e2c1342a406c4f0b6a..e192624a261e1291f1610e8e7e700d99a9d814d2 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -97,6 +97,8 @@ class BlockDesc {
 
   std::vector<OpDesc *> AllOps() const;
 
+  void Clear();
+
   size_t OpSize() const { return ops_.size(); }
 
   OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
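Note on the change above: BlockDesc::Clear() drops every op and every variable that is not persistable, so parameters survive while per-step temporaries go away. A minimal Python-side sketch of the intended effect, assuming the `_clear_block` binding and the `Block._clear_block` wrapper added later in this diff; the imperative code around it is illustrative only:

    import paddle.fluid as fluid

    with fluid.imperative.guard():
        # ... run some imperative ops; the block accumulates op descs and
        # temporary (non-persistable) var descs for each forward step ...
        block = fluid.default_main_program().global_block()
        block._clear_block()  # ops are gone, only persistable vars remain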
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 8c91f867814c064b6c9f148666eda68d2d7a0793..bbf614831ca817031b209fffec043495ea24d10f 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -114,23 +114,23 @@ class VarBase {
  public:
   VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
 
-  // Owns `var` and `grad`
+  explicit VarBase(bool stop_gradient)
+      : VarBase(new framework::Variable(),
+                stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
+
   VarBase(framework::Variable* var, VarBase* grad)
+      : VarBase(var, grad, false) {}
+
+ private:
+  VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
       : var_desc_(nullptr),
         var_(var),
         grads_(grad),
-        stop_gradient_(false),
-        pre_op_(nullptr),
-        pre_op_out_idx_(-1) {}
-
-  explicit VarBase(bool stop_gradient)
-      : var_desc_(nullptr),
-        var_(new framework::Variable()),
-        grads_(stop_gradient ? nullptr : new VarBase(true)),
         stop_gradient_(stop_gradient),
         pre_op_(nullptr),
         pre_op_out_idx_(-1) {}
 
+ public:
   virtual ~VarBase() {
     if (var_) {
       delete var_;
@@ -141,11 +141,13 @@ class VarBase {
     }
   }
 
-  OpBase* PreOp() const { return pre_op_; }
-  int PreOpOutIdx() const { return pre_op_out_idx_; }
+  inline OpBase* PreOp() const { return pre_op_; }
+  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
 
-  void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; }
-  bool IsStopGradient() const { return stop_gradient_; }
+  inline void SetStopGradient(bool stop_gradient) {
+    stop_gradient_ = stop_gradient;
+  }
+  inline bool IsStopGradient() const { return stop_gradient_; }
 
   void RunBackward();
 
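With the constructors above collapsed into one private three-argument constructor, a VarBase created with stop_gradient=true never allocates a gradient VarBase at all. A short sketch of how the new bool overload is consumed from Python, mirroring the framework.py hunk later in this diff (`core` is paddle.fluid.core):

    from paddle.fluid import core

    trainable = core.VarBase(False)  # owns an internal grad VarBase
    frozen = core.VarBase(True)      # stop-gradient: grads_ stays nullptr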
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index a77c842bd8958ba55f0927b3dc2999be9bb34ba5..2993ab309027f9306c61023b55b1c061e0ebddc0 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/imperative/tracer.h"
 
+#include <set>
+
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -66,16 +68,18 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
   return result;
 }
 
-void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
-                   const VarBasePtrMap& outputs, framework::BlockDesc* block,
-                   const platform::Place expected_place,
-                   const bool stop_gradient) {
+std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
+                                    const VarBasePtrMap& outputs,
+                                    framework::BlockDesc* block,
+                                    const platform::Place expected_place,
+                                    const bool stop_gradient) {
   std::map<std::string, VarBase*> vars;
 
   framework::OpDesc* op_desc = op->op_desc_;
   VLOG(3) << "tracer tracing " << op_desc->Type();
   op_desc->InferShape(*block);
   op_desc->InferVarType(block);
+
   std::unique_ptr<framework::OperatorBase> op_base =
       framework::OpRegistry::CreateOp(*op_desc);
 
@@ -92,7 +96,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       invars.emplace_back(inp->var_);
       vars[inp->var_desc_->Name()] = inp;
-      if (inp->PreOp()) {
+      if (inp->PreOp() && !inp->IsStopGradient()) {
         op->pre_ops_[it.first].push_back(inp->PreOp());
         op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
       } else {
@@ -142,6 +146,8 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
                                   prepared_op.ctx, prepared_op.kernel_configs));
 
+  std::set<std::string> vars_saved_for_backward;
+
   if (!stop_gradient) {
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
         new std::unordered_map<std::string, std::string>());
@@ -161,6 +167,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
           PADDLE_ENFORCE(fwd_var_it != vars.end());
           // Forward inputs or outputs.
           grad_in_vars.push_back(fwd_var_it->second->var_);
+          vars_saved_for_backward.insert(it.first);
         } else {
           VarBase* var = vars[var_it->second];
           if (!var->grads_->var_->IsInitialized()) {
@@ -194,6 +201,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   }
 
   op->block_ = block;
+  return vars_saved_for_backward;
 }
 
 std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
@@ -203,7 +211,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
   op->input_vars_[PyLayer::kFwdInp] = inputs;
   op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
   for (VarBase* inp : inputs) {
-    if (inp->PreOp()) {
+    if (inp->PreOp() && !inp->IsStopGradient()) {
       op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
       op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
     } else {
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 690838215581b09ff35a0ea13f30655b77e6e187..98909e378f0e4188250fcb6efd9502dcc9740da4 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <map>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -43,10 +44,11 @@ class Tracer {
 
   virtual ~Tracer() {}
 
-  void Trace(OpBase* op, const VarBasePtrMap& inputs,
-             const VarBasePtrMap& outputs, framework::BlockDesc* block,
-             const platform::Place expected_place,
-             const bool stop_gradient = false);
+  std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
+                              const VarBasePtrMap& outputs,
+                              framework::BlockDesc* block,
+                              const platform::Place expected_place,
+                              const bool stop_gradient = false);
 
   std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
                                 bool stop_gradient = false);
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 31c3bfa43ffec22059a602e9ff09a33188d72c91..aeabed19abfda3c857f54e5ada54d52bf95e2602 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -34,8 +34,8 @@ void BindTracer(pybind11::module* m) {
              framework::BlockDesc* block,
              const platform::CPUPlace expected_place,
              const bool stop_gradient = false) {
-            self.Trace(op, inputs, outputs, block, expected_place,
-                       stop_gradient);
+            return self.Trace(op, inputs, outputs, block, expected_place,
+                              stop_gradient);
           })
       .def("trace",
            [](imperative::Tracer& self, imperative::OpBase* op,
@@ -44,8 +44,8 @@ void BindTracer(pybind11::module* m) {
              framework::BlockDesc* block,
              const platform::CUDAPlace expected_place,
              const bool stop_gradient = false) {
-            self.Trace(op, inputs, outputs, block, expected_place,
-                       stop_gradient);
+            return self.Trace(op, inputs, outputs, block, expected_place,
+                              stop_gradient);
           })
       .def("py_trace", &imperative::Tracer::PyTrace,
            pybind11::return_value_policy::take_ownership);
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index e729be4a95a58510f1e0162af4216feaa400d971..48fe445b7d01287c37bcf7d4811f687785ca78d5 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -189,6 +189,8 @@ void BindBlockDesc(pybind11::module *m) {
             return self.HasVar(name);
           },
           pybind11::return_value_policy::reference)
+      .def("_clear_block", [](pd::BlockDesc &self) { return self.Clear(); },
+           pybind11::return_value_policy::reference)
       .def("_rename_var",
            [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
               const pybind11::bytes &byte_name_new) {
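Tracer::Trace now returns the set of input/output slot names whose variables are captured by the generated gradient op, and the PreOp checks skip stop-gradient inputs so they no longer extend the backward graph. A rough sketch of the new Python-visible contract of the "trace" binding above; variable names follow the framework.py hunk below and the slot names in the comment are only examples:

    # pybind converts the std::set<std::string> into a Python set of slot
    # names, e.g. {"X", "Out"}, whose variables must stay alive for backward
    backward_refs = tracer.trace(op.iop, op.inputs, op.outputs, block.desc,
                                 expected_place, stop_gradient)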
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 489f8d6b3a94373d009ba2be3b0de1de13643c5c..7fe3120acc756d9335e5adbc2e8cfb4270e41c02 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -387,16 +387,19 @@ class Variable(object):
                 # get_capacity is implemented
                 pass
 
-        self.block.vars[name] = self
-        self.op = None
-        self.stop_gradient = stop_gradient
-        self.is_data = is_data
         if _in_imperative_mode():
+            # record vars in tracer rather than blocks
             self._ivar = kwargs.get("ivar", None)
             if not self._ivar:
-                self._ivar = core.VarBase()
+                self._ivar = core.VarBase(stop_gradient)
             self._ivar.desc = self.desc
-            self._ivar.stop_gradient = stop_gradient
+            if persistable:
+                self.block.vars[name] = self
+        else:
+            self.block.vars[name] = self
+        self.op = None
+        self.stop_gradient = stop_gradient
+        self.is_data = is_data
 
     def _numpy(self):
         new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
@@ -739,6 +742,7 @@ class Operator(object):
         if _in_imperative_mode():
             self.iop = core.OpBase()
             self.iop.desc = self.desc
+
             self.inputs = defaultdict(list)
             if inputs is not None:
                 for k, v in six.iteritems(inputs):
@@ -746,6 +750,7 @@ class Operator(object):
                         self.inputs[k].append(v._ivar)
                     elif isinstance(v, list) or isinstance(v, tuple):
                         self.inputs[k].extend([var._ivar for var in v])
+
             self.outputs = defaultdict(list)
             if outputs is not None:
                 for k, v in six.iteritems(outputs):
@@ -1195,6 +1200,15 @@ class Block(object):
         else:
             raise ValueError("Var {0} is not found recursively".format(name))
 
+    def _clear_block(self):
+        # TODO(minqiyang): move this to backward_hooks
+        self.desc._clear_block()
+
+        for name in self.vars.keys():
+            assert self.vars[name].persistable
+
+        del self.ops[:]
+
     def all_parameters(self):
         return list(self.iter_parameters())
 
@@ -1325,18 +1339,31 @@ class Block(object):
             inputs=kwargs.get("inputs", None),
             outputs=kwargs.get("outputs", None),
             attrs=kwargs.get("attrs", None))
+
+        if _in_imperative_mode():
+            # record ops in tracer rather than blocks
+            #
+            # TODO(minqiyang): add op stop_gradient support in static mode too.
+            # currently, we only support stop_gradient in imperative mode.
+            self._trace_op(op, kwargs.get("stop_gradient", False))
         self.ops.append(op)
 
-        # TODO(minqiyang): add stop_gradient support in static mode too.
-        # currently, we only support stop_gradient in imperative mode.
-        self._trace_op(op, kwargs.get("stop_gradient", False))
         return op
 
     def _trace_op(self, op, stop_gradient=False):
-        if _in_imperative_mode():
-            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
-                                       _imperative_current_expected_place_,
-                                       stop_gradient)
+        backward_refs = _imperative_tracer().trace(
+            op.iop, op.inputs, op.outputs, self.desc,
+            _imperative_current_expected_place_, stop_gradient)
+
+        # TODO(minqiyang): support backward_hooks to eager remove backward_refs
+        op.backward_refs = defaultdict(list)
+        for k, v in six.iteritems(op.inputs):
+            if k in backward_refs:
+                op.backward_refs[k] = op.inputs[k]
+
+        for k, v in six.iteritems(op.outputs):
+            if k in backward_refs:
+                op.backward_refs[k] = op.outputs[k]
 
     def _insert_op(self, index, *args, **kwargs):
         """
@@ -1391,7 +1418,8 @@ class Block(object):
             outputs=kwargs.get("outputs", None),
             attrs=kwargs.get("attrs", None))
         self.ops.insert(0, op)
-        self._trace_op(op, kwargs.get("stop_gradient", False))
+        if _in_imperative_mode():
+            self._trace_op(op, kwargs.get("stop_gradient", False))
         return op
 
     def _sync_with_cpp(self):
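Taken together, the framework.py changes record imperative vars and ops in the tracer instead of the block, keep only the slots reported in backward_refs, and expose Block._clear_block() to drop everything else after a step. A condensed sketch of the per-iteration pattern this enables, assuming a model/optimizer pair like the ones in the updated tests below (`model` and `prepare` are placeholders):

    for batch_id, data in enumerate(train_reader()):
        loss = model(to_variable(prepare(data)))
        loss._backward()
        optimizer.minimize(loss)
        model.clear_gradients()
        # temporaries recorded during this step are released;
        # persistable parameters survive the call
        fluid.default_main_program().global_block()._clear_block()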
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index 780c6a6be567c9f60f472c27cebd5300d56eb378..0d0a3bbe0bd47fe0e01761f8b42c92b884a5680a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -105,7 +105,7 @@ class MNIST(fluid.imperative.Layer):
 class TestImperativeMnist(unittest.TestCase):
     def test_mnist_float32(self):
         seed = 90
-        batch_num = 2
+        epoch_num = 1
         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
@@ -113,39 +113,40 @@ class TestImperativeMnist(unittest.TestCase):
             mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128)
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
 
             dy_param_init_value = {}
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                dy_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    128, 1)
-
-                img = to_variable(dy_x_data)
-                label = to_variable(y_data)
-                label._stop_gradient = True
-
-                cost = mnist(img)
-                loss = fluid.layers.cross_entropy(cost, label)
-                avg_loss = fluid.layers.mean(loss)
-                dy_out = avg_loss._numpy()
-
-                if batch_id == 0:
-                    for param in fluid.default_main_program().global_block(
-                    ).all_parameters():
-                        dy_param_init_value[param.name] = param._numpy()
-
-                avg_loss._backward()
-                sgd.minimize(avg_loss)
-                mnist.clear_gradients()
-                dy_param_value = {}
-                for param in fluid.default_main_program().global_block(
-                ).all_parameters():
-                    dy_param_value[param.name] = param._numpy()
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
+                    label._stop_gradient = True
+
+                    cost = mnist(img)
+                    loss = fluid.layers.cross_entropy(cost, label)
+                    avg_loss = fluid.layers.mean(loss)
+
+                    dy_out = avg_loss._numpy()
+
+                    if epoch == 0 and batch_id == 0:
+                        for param in mnist.parameters():
+                            dy_param_init_value[param.name] = param._numpy()
+
+                    avg_loss._backward()
+                    sgd.minimize(avg_loss)
+                    mnist.clear_gradients()
+
+                    fluid.default_main_program().global_block()._clear_block()
+
+                    dy_param_value = {}
+                    for param in mnist.parameters():
+                        dy_param_value[param.name] = param._numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -157,7 +158,7 @@ class TestImperativeMnist(unittest.TestCase):
             mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128)
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
 
             img = fluid.layers.data(
                 name='pixel', shape=[1, 28, 28], dtype='float32')
@@ -170,8 +171,7 @@ class TestImperativeMnist(unittest.TestCase):
             # initialize params and fetch them
             static_param_init_value = {}
             static_param_name_list = []
-            for param in fluid.default_startup_program().global_block(
-            ).all_parameters():
+            for param in mnist.parameters():
                 static_param_name_list.append(param.name)
 
             out = exe.run(fluid.default_startup_program(),
@@ -180,26 +180,29 @@ class TestImperativeMnist(unittest.TestCase):
             for i in range(len(static_param_name_list)):
                 static_param_init_value[static_param_name_list[i]] = out[i]
 
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                static_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [128, 1])
-
-                fetch_list = [avg_loss.name]
-                fetch_list.extend(static_param_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
-                              fetch_list=fetch_list)
-
-                static_param_value = {}
-                static_out = out[0]
-                for i in range(1, len(out)):
-                    static_param_value[static_param_name_list[i - 1]] = out[i]
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    static_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape([128, 1])
+
+                    fetch_list = [avg_loss.name]
+                    fetch_list.extend(static_param_name_list)
+                    out = exe.run(
+                        fluid.default_main_program(),
+                        feed={"pixel": static_x_data,
+                              "label": y_data},
+                        fetch_list=fetch_list)
+
+                    static_param_value = {}
+                    static_out = out[0]
+                    for i in range(1, len(out)):
+                        static_param_value[static_param_name_list[i - 1]] = out[
+                            i]
+
+        self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
 
         for key, value in six.iteritems(static_param_init_value):
             self.assertTrue(np.allclose(value, dy_param_init_value[key]))
@@ -207,7 +210,7 @@ class TestImperativeMnist(unittest.TestCase):
         self.assertTrue(np.allclose(static_out, dy_out))
 
         for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
+            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
 
 
 if __name__ == '__main__':
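A side note on the test change above: drop_last=True keeps every batch at exactly 128 samples, so the hard-coded reshape in the test cannot fail on a short final batch, e.g.:

    y_data = np.array([x[1] for x in data]).astype('int64').reshape(128, 1)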
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 0e134742a7e80c462206072644bb4bf196397b38..4892495e1108e6d2a7e96cab88dc7668e360d79f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -231,7 +231,7 @@ class TestImperativeResnet(unittest.TestCase):
         seed = 90
 
         batch_size = train_parameters["batch_size"]
-        batch_num = 1
+        batch_num = 2
         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
@@ -286,6 +286,8 @@ class TestImperativeResnet(unittest.TestCase):
                 optimizer.minimize(avg_loss)
                 resnet.clear_gradients()
 
+                fluid.default_main_program().global_block()._clear_block()
+
                 dy_param_value = {}
                 for param in resnet.parameters():
                     dy_param_value[param.name] = param._numpy()
@@ -319,11 +321,9 @@ class TestImperativeResnet(unittest.TestCase):
             static_param_init_value = {}
             static_param_name_list = []
             static_grad_name_list = []
-            for param in fluid.default_startup_program().global_block(
-            ).all_parameters():
+            for param in resnet.parameters():
                 static_param_name_list.append(param.name)
-            for param in fluid.default_main_program().global_block(
-            ).all_parameters():
+            for param in resnet.parameters():
                 if not param.stop_gradient:
                     static_grad_name_list.append(param.name +
                                                  core.grad_var_suffix())