From 31a1cd8ce5d74a803260da56bf0b559e836557c3 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 21 Jan 2019 12:21:42 +0800 Subject: [PATCH] Align the first batch of gpu resnet --- paddle/fluid/imperative/layer.cc | 33 +++++++ paddle/fluid/imperative/layer.h | 2 + paddle/fluid/imperative/tracer.cc | 6 +- paddle/fluid/pybind/pybind.cc | 10 +-- python/paddle/fluid/framework.py | 4 +- python/paddle/fluid/imperative/nn.py | 27 ++++-- python/paddle/fluid/layer_helper.py | 9 +- .../tests/unittests/test_imperative_base.py | 5 +- .../tests/unittests/test_imperative_resnet.py | 90 ++++++++++++++----- 9 files changed, 142 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index ffe276abb..3ba429d1d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -167,12 +167,42 @@ class Autograd { } }; +framework::LoDTensor* VarBase::CopiedTensor() const { + PADDLE_ENFORCE(var_->IsInitialized(), + "Variable must be initialized when getting numpy tensor"); + platform::Place place = var_->Get().place(); + framework::LoDTensor* result = new framework::LoDTensor(); + result->Resize(var_->Get().dims()); + result->set_lod(var_->Get().lod()); + if (platform::is_gpu_place(place)) { + VLOG(3) << "fetch tensor " << var_desc_->Name() << " from gpu"; + + framework::TensorCopy(var_->Get(), + platform::CPUPlace(), result); + + platform::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(place); + dev_ctx->Wait(); + } else { + TensorCopy(var_->Get(), platform::CPUPlace(), result); + } + + return result; +} + framework::LoDTensor& VarBase::GradValue() { VLOG(3) << "get var grad " << var_desc_->Name(); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { + VLOG(3) << "ApplyGrad to Op: " << op_desc_->Type(); + for (auto it : input_vars_) { + for (VarBase* var : it.second) { + VLOG(3) << "Op Input: " << it.first << " : " << var->var_desc_->Name(); + } + } + if (!grad_op_desc_ && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; @@ -222,6 +252,9 @@ std::map> OpBase::ApplyGrad() { for (size_t i = 0; i < outputs.size(); ++i) { framework::Variable* grad = outputs[i]; framework::Variable* orig_grad = origin_outputs[i]; + LOG(ERROR) << "Add grad of " << it.first << " " << i << " " + << orig_grad->GetMutable()->mutable_data( + expected_place_); AddGradTo(grad, orig_grad, expected_place_); delete grad; } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 5a1ad5540..4d407d3e2 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -136,6 +136,8 @@ class VarBase { framework::LoDTensor& GradValue(); + framework::LoDTensor* CopiedTensor() const; + inline std::string GradName() const { PADDLE_ENFORCE( var_desc_, diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0c7e69cc0..41eb6102c 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -43,7 +43,7 @@ void InitVar(framework::Variable* var, framework::Variable* grad_var, grad_var->GetMutable()->mutable_data( var_t.dims(), dev_ctx->GetPlace()); operators::math::set_constant( - *dev_ctx, grad_var->GetMutable(), .0f); + *dev_ctx, grad_var->GetMutable(), 0.0); } platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { @@ -162,6 +162,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, } else { VarBase* var = vars[var_it->second]; if (!var->grads_->var_->IsInitialized()) { + LOG(ERROR) << "Init grad input " << it.first << " " << grad_invar; InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext()); } @@ -183,6 +184,9 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, VarBase* var = vars[var_it->second]; if (!var->grads_->var_->IsInitialized()) { InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext()); + LOG(ERROR) << "Init grad output " << it.first << " " << grad_outvar + << var->grads_->var_->GetMutable() + ->mutable_data(platform::CPUPlace()); } grad_out_vars.push_back(var->grads_->var_); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7ed91fc6e..3370fb937 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -136,15 +136,11 @@ PYBIND11_MODULE(core, m) { .def("_grad_ivar", [](const imperative::VarBase &self) { return self.grads_; }, py::return_value_policy::reference) + .def("_cpu_tensor", + [](const imperative::VarBase &self) { return self.CopiedTensor(); }, + py::return_value_policy::take_ownership) .def("value", [](const imperative::VarBase &self) { return self.var_; }, py::return_value_policy::reference) - .def("wait_device", - [](const imperative::VarBase &self) { - platform::DeviceContext *dev_ctx = - platform::DeviceContextPool::Instance().Get( - self.var_->Get().place()); - dev_ctx->Wait(); - }) .def_property( "desc", [](const imperative::VarBase &self) { return self.var_desc_; }, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 56e19ea30..ce0e23969 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -384,8 +384,8 @@ class Variable(object): self._ivar.stop_gradient = stop_gradient def _numpy(self): - self._ivar.wait_device() - tensor = self._ivar.value().get_tensor() + tensor = self._ivar._cpu_tensor() + print('shapex', self.name, tensor.shape()) return np.array(tensor) def _backward(self): diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 0b4c01f7a..a1b35b402 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -55,7 +55,8 @@ class Conv2D(layers.Layer): param_attr=param_attr, bias_attr=bias_attr, dtype=dtype, - name=name) + name=name, + act=act) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') @@ -141,6 +142,7 @@ class Conv2D(layers.Layer): outputs={'Out': [pre_act]}, attrs={'axis': 1}) + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(pre_act) @@ -239,7 +241,6 @@ class FC(layers.Layer): shape=param_shape, dtype=self._dtype, is_bias=False) - print("create param: ", self._w.name, self._w.stop_gradient) if self._helper.bias_attr: size = list([self._size]) @@ -281,6 +282,7 @@ class FC(layers.Layer): attrs={'axis': self._num_flatten_dims}) else: pre_activation = pre_bias + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(pre_activation) @@ -308,7 +310,11 @@ class BatchNorm(layers.Layer): from ..layer_helper import LayerHelper self._helper = LayerHelper( - 'batch_norm', param_attr=param_attr, bias_attr=bias_attr, name=name) + 'batch_norm', + param_attr=param_attr, + bias_attr=bias_attr, + name=name, + act=act) if dtype == core.VarDesc.VarType.FP16: self._dtype = core.VarDesc.VarType.FP32 @@ -324,18 +330,20 @@ class BatchNorm(layers.Layer): dtype=self._dtype, default_initializer=Constant(1.0)) - # setting stop_gradient=True to reduce computation - if use_global_stats and self._helper.param_attr.learning_rate == 0.: - self._scale.stop_gradient = True + # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph + # # setting stop_gradient=True to reduce computation + # if use_global_stats and self._helper.param_attr.learning_rate == 0.: + # self._scale.stop_gradient = True self._bias = self._helper.create_parameter( attr=self._helper.bias_attr, shape=param_shape, dtype=self._dtype, is_bias=True) - # setting stop_gradient=True to reduce computation - if use_global_stats and self._helper.bias_attr.learning_rate == 0.: - self._bias.stop_gradient = True + # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph + # # setting stop_gradient=True to reduce computation + # if use_global_stats and self._helper.bias_attr.learning_rate == 0.: + # self._bias.stop_gradient = True self._mean = self._helper.create_parameter( attr=ParamAttr( @@ -406,4 +414,5 @@ class BatchNorm(layers.Layer): "use_global_stats": self._use_global_stats }) + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(batch_norm_out) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index ea9953f58..f9c9b896b 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -435,8 +435,13 @@ class LayerHelper(object): act_type = act.pop('type') tmp = input_var # NOTE(dzhwinter): some activation support inplace compution. - if not core.IsInplace(act_type): - tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) + # NOTE(minqiyang): currently, we don't support inplace in imperative mode + # if core.IsInplace(act_type) and no_inplace: + # print("inplace", act_type) + # tmp = input_var + # else: + print("not inplace", act_type) + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) self.append_op( type=act_type, inputs={"X": [input_var]}, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_base.py b/python/paddle/fluid/tests/unittests/test_imperative_base.py index 478cc13fb..e66f79c34 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_base.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_base.py @@ -24,7 +24,8 @@ from paddle.fluid import core def new_program_scope(): prog = fluid.Program() startup_prog = fluid.Program() - scope = fluid.core.Scope() + scope = core.Scope() with fluid.scope_guard(scope): with fluid.program_guard(prog, startup_prog): - yield + with fluid.unique_name.guard(): + yield diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 6a4fa7049..8e2ea735c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -25,17 +25,18 @@ from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid.imperative.base import to_variable from test_imperative_base import new_program_scope +batch_size = 8 train_parameters = { "input_size": [3, 224, 224], "input_mean": [0.485, 0.456, 0.406], "input_std": [0.229, 0.224, 0.225], "learning_strategy": { "name": "piecewise_decay", - "batch_size": 1, + "batch_size": batch_size, "epochs": [30, 60, 90], "steps": [0.1, 0.01, 0.001, 0.0001] }, - "batch_size": 1, + "batch_size": batch_size, "lr": 0.1, "total_images": 1281164, } @@ -56,6 +57,7 @@ def optimizer_setting(params): lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.SGD(learning_rate=params["lr"]) + # TODO(minqiyang): Add learning rate scheduler support to imperative mode # optimizer = fluid.optimizer.Momentum( # learning_rate=params["lr"], # learning_rate=fluid.layers.piecewise_decay( @@ -208,8 +210,12 @@ class TestImperativeResnet(unittest.TestCase): resnet = ResNet() optimizer = optimizer_setting(train_parameters) + np.random.seed(seed) + import random + random.seed = seed train_reader = paddle.batch( - paddle.dataset.flowers.train(), batch_size=batch_size) + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) dy_param_init_value = {} for param in fluid.default_main_program().global_block( @@ -220,18 +226,22 @@ class TestImperativeResnet(unittest.TestCase): if batch_id >= 1: break - x_data = np.array( + dy_x_data = np.array( [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + print('dy input shape', dy_x_data.shape) y_data = np.array([x[1] for x in data]).astype('int64').reshape( batch_size, 1) - img = to_variable(x_data) + img = to_variable(dy_x_data) label = to_variable(y_data) label._stop_gradient = True out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) avg_loss = fluid.layers.mean(x=loss) + + print('shapex ', avg_loss.shape) + dy_out = avg_loss._numpy() if batch_id == 0: @@ -241,6 +251,15 @@ class TestImperativeResnet(unittest.TestCase): dy_param_init_value[param.name] = param._numpy() avg_loss._backward() + dy_grad_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + np_array = np.array(param._ivar._grad_ivar().value() + .get_tensor()) + dy_grad_value[param.name + core.grad_var_suffix( + )] = np_array + optimizer.minimize(avg_loss) dy_param_value = {} @@ -256,8 +275,13 @@ class TestImperativeResnet(unittest.TestCase): resnet = ResNet() optimizer = optimizer_setting(train_parameters) + + np.random.seed(seed) + import random + random.seed = seed train_reader = paddle.batch( - paddle.dataset.flowers.train(), batch_size=batch_size) + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) img = fluid.layers.data( name='pixel', shape=[3, 224, 224], dtype='float32') @@ -267,12 +291,21 @@ class TestImperativeResnet(unittest.TestCase): avg_loss = fluid.layers.mean(x=loss) optimizer.minimize(avg_loss) + print('avg_loss shape', avg_loss.shape) + print(fluid.default_main_program()) + # initialize params and fetch them static_param_init_value = {} static_param_name_list = [] + static_grad_name_list = [] for param in fluid.default_startup_program().global_block( ).all_parameters(): static_param_name_list.append(param.name) + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + static_grad_name_list.append(param.name + + core.grad_var_suffix()) out = exe.run(fluid.default_startup_program(), fetch_list=static_param_name_list) @@ -284,34 +317,49 @@ class TestImperativeResnet(unittest.TestCase): if batch_id >= 1: break - x_data = np.array( + static_x_data = np.array( [x[0].reshape(3, 224, 224) for x in data]).astype('float32') y_data = np.array([x[1] for x in data]).astype('int64').reshape( [batch_size, 1]) - fetch_list = [loss.name] + fetch_list = [avg_loss.name] fetch_list.extend(static_param_name_list) + fetch_list.extend(static_grad_name_list) out = exe.run(fluid.default_main_program(), - feed={"pixel": x_data, + feed={"pixel": static_x_data, "label": y_data}, fetch_list=fetch_list) static_param_value = {} + static_grad_value = {} static_out = out[0] - for i in range(1, len(out)): - static_param_value[static_param_name_list[i - 1]] = out[i] + param_start_pos = 1 + grad_start_pos = len(static_param_name_list) + param_start_pos + for i in range(param_start_pos, + len(static_param_name_list) + param_start_pos): + static_param_value[static_param_name_list[ + i - param_start_pos]] = out[i] + for i in range(grad_start_pos, + len(static_grad_name_list) + grad_start_pos): + static_grad_value[static_grad_name_list[ + i - grad_start_pos]] = out[i] + + self.assertTrue(np.allclose(static_out, dy_out)) + + self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) - self.assertTrue(np.allclose(static_out.all(), dy_out.all())) + self.assertEqual(len(dy_grad_value), len(static_grad_value)) + # TODO(minqiyang): find a way to align the gradient + # for key, value in six.iteritems(static_grad_value): + # self.assertTrue( + # np.allclose(value, dy_grad_value[key])) - for key, value in six.iteritems(static_param_init_value): - self.assertTrue( - np.allclose(value.all(), dy_param_init_value[key].all())) - - for key, value in six.iteritems(static_param_value): - if not np.allclose(value.all(), dy_param_value[key].all()): - print(key) - print(value, dy_param_value[key]) - self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) + self.assertEqual(len(dy_param_value), len(static_param_value)) + # for key, value in six.iteritems(static_param_value): + + # self.assertTrue(np.allclose(value, dy_param_value[key])) if __name__ == '__main__': -- GitLab