From 29f3441615872eb724cb708ffe5fd42b1befe0c0 Mon Sep 17 00:00:00 2001
From: Jiabin Yang
Date: Mon, 8 Apr 2019 14:39:27 +0800
Subject: [PATCH] Cherry pick/fix transformer (#16620)

* Imperative deep-first backward process (#16605)

* Fix bug of gradient interface

* shrink transformer

* Right transformer

* Change from width-first backward to deep-first backward process

test=develop

* Reverse iterator op's input

test=develop

* Polish code

* Change the iteration direction in ingrads' map slots

test=develop

* Polish code

test=develop

* test=develop, cherry-pick fix for transformer in dygraph

* test=develop, fix transformer in dygraph
---
 paddle/fluid/imperative/layer.cc              |  8 ++--
 python/paddle/fluid/framework.py              |  3 +-
 .../unittests/test_imperative_transformer.py  | 42 ++++++--------------
 3 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 036d2a50a4a..bc03285a4c5 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -122,14 +122,14 @@ class Autograd {
       std::map<std::string, std::vector<VarBase*>> input_grads =
           ready_op->ApplyGrad();
 
-      for (auto it : input_grads) {
-        const std::vector<VarBase*>& ingrads = it.second;
+      for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
+        const std::vector<VarBase*>& ingrads = it->second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
+          if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
             continue;
           }
-          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          OpBase* pre_op = ready_op->pre_ops_[it->first][i];
           if (!pre_op) continue;
 
           dep_counts[pre_op] -= 1;
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 0f5a8f51463..7953d98bcbb 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -493,7 +493,8 @@ class Variable(object):
         self._ivar._run_backward()
 
     def _gradient(self):
-        return np.array(self._ivar._grad_value())
+        new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
+        return np.array(new_ivar.value().get_tensor())
 
     def _clear_gradient(self):
         self._ivar._clear_gradient()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
index 3bdf3349730..df097360c63 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -302,8 +302,11 @@ use_py_reader = False
 # if we run sync mode
 sync = False
 
-# how many batches we use
-batch_num = 2
+if not core.is_compiled_with_cuda():
+    # how many batches we use
+    batch_num = 50
+else:
+    batch_num = 5
 
 np.random.seed = 1
 src_word_np = np.random.randint(
@@ -335,24 +338,6 @@ lbl_word_np = np.random.randint(
     dtype='int64')
 lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
 
-# np.random.seed = 1
-# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# src_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# trg_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
-# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
-#
 pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
                                   ModelHyperParams.d_model)
 pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
@@ -739,7 +724,7 @@ class DecoderSubLayer(Layer):
         enc_attn_output_pp = self._multihead_attention_layer2(
             pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
         enc_attn_output = self._post_process_layer2(
-            slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
+            slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
             self._prepostprcess_dropout)
         pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
                                                     self._preprocess_cmd,
@@ -1076,20 +1061,17 @@ class TestDygraphTransformer(unittest.TestCase):
                                                                 4]] = out[k]
         self.assertTrue(
-            np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
+            np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
         self.assertTrue(
-            np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
+            np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
         self.assertTrue(
-            np.allclose(
-                static_predict_value, dy_predict._numpy(), atol=1e-5))
+            np.array_equal(static_predict_value, dy_predict._numpy()))
         self.assertTrue(
-            np.allclose(static_token_num_value, dy_token_num._numpy()))
+            np.array_equal(static_token_num_value, dy_token_num._numpy()))
 
         for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.allclose(value, dy_param_init[key]))
+            self.assertTrue(np.array_equal(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(
-                np.allclose(
-                    value, dy_param_updated[key], atol=1e-4))
+            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
 
 
 if __name__ == '__main__':
--
GitLab
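
Note on the layer.cc hunk above: the only mechanical change is walking the input_grads map with rbegin()/rend() instead of a forward range-for, which flips the order in which producer ops have their dependency counts decremented and are pushed onto the ready queue, giving the deep-first backward order named in the commit message. The snippet below is a minimal standalone sketch of that iteration-order difference only; it is not PaddlePaddle code, and the map keys and values are hypothetical stand-ins for the real gradient slots.

// Standalone sketch (hypothetical data, plain ints instead of VarBase*):
// the same std::map walked forward and in reverse, mirroring the patched loop.
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<int>> input_grads = {
      {"Input", {1, 2}}, {"Weight", {3}}, {"Bias", {4}}};

  // Old loop shape: ascending key order -> Bias, Input, Weight.
  for (const auto& it : input_grads) {
    std::cout << it.first << " ";
  }
  std::cout << "\n";

  // New loop shape: descending key order -> Weight, Input, Bias.
  // In the patched autograd loop this decides which pending producer op
  // reaches a zero dependency count (and is processed) first.
  for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
    std::cout << it->first << " ";
  }
  std::cout << "\n";
  return 0;
}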