未验证 提交 29f34416 编写于 作者: J Jiabin Yang 提交者: GitHub

Cherry pick/fix transformer (#16620)

* Imperative deep-first backward process (#16605)

* Fix bug of gradient interface

* shrink transformer

* Right transformer

* Change from width-first backward to deep-first backward process

test=develop

* Reverse iterator op's input

test=develop

* Polish code

* Change the iteration direction in ingrads' map slots

test=develop

* Polish code

test=develop

* test=develop, cherry-pick fix for transformer in dygraph

* test=develop, fix transformer in dygraph
/
上级 89d09b83
......@@ -122,14 +122,14 @@ class Autograd {
std::map<std::string, std::vector<VarBase*>> input_grads =
ready_op->ApplyGrad();
for (auto it : input_grads) {
const std::vector<VarBase*>& ingrads = it.second;
for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
const std::vector<VarBase*>& ingrads = it->second;
for (size_t i = 0; i < ingrads.size(); ++i) {
if (!ingrads[i]) continue;
if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
continue;
}
OpBase* pre_op = ready_op->pre_ops_[it.first][i];
OpBase* pre_op = ready_op->pre_ops_[it->first][i];
if (!pre_op) continue;
dep_counts[pre_op] -= 1;
......
......@@ -493,7 +493,8 @@ class Variable(object):
self._ivar._run_backward()
def _gradient(self):
return np.array(self._ivar._grad_value())
new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
return np.array(new_ivar.value().get_tensor())
def _clear_gradient(self):
self._ivar._clear_gradient()
......
......@@ -302,8 +302,11 @@ use_py_reader = False
# if we run sync mode
sync = False
# how many batches we use
batch_num = 2
if not core.is_compiled_with_cuda():
# how many batches we use
batch_num = 50
else:
batch_num = 5
np.random.seed = 1
src_word_np = np.random.randint(
......@@ -335,24 +338,6 @@ lbl_word_np = np.random.randint(
dtype='int64')
lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
# np.random.seed = 1
# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# src_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# trg_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
#
pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
......@@ -739,7 +724,7 @@ class DecoderSubLayer(Layer):
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2(
slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd,
......@@ -1076,20 +1061,17 @@ class TestDygraphTransformer(unittest.TestCase):
4]] = out[k]
self.assertTrue(
np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
self.assertTrue(
np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
self.assertTrue(
np.allclose(
static_predict_value, dy_predict._numpy(), atol=1e-5))
np.array_equal(static_predict_value, dy_predict._numpy()))
self.assertTrue(
np.allclose(static_token_num_value, dy_token_num._numpy()))
np.array_equal(static_token_num_value, dy_token_num._numpy()))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.allclose(value, dy_param_init[key]))
self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(
np.allclose(
value, dy_param_updated[key], atol=1e-4))
self.assertTrue(np.array_equal(value, dy_param_updated[key]))
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册