From f610916c5cb46426dc0d15fbff1c51bd7a404441 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 8 Nov 2018 13:04:23 +0800 Subject: [PATCH] Fix the reshape in no-weight-sharing mode of Transformer --- .../neural_machine_translation/transformer/model.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py index 9fd04cc9..1e510bc6 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py @@ -129,10 +129,12 @@ def multi_head_attention(queries, # input from the previous time step first. k = cache["k"] = layers.concat( [layers.reshape( - cache["k"], shape=[0, 0, d_model]), k], axis=1) + cache["k"], shape=[0, 0, d_key * n_head]), k], + axis=1) v = cache["v"] = layers.concat( [layers.reshape( - cache["v"], shape=[0, 0, d_model]), v], axis=1) + cache["v"], shape=[0, 0, d_value * n_head]), v], + axis=1) q = __split_heads(q, n_head) k = __split_heads(k, n_head) @@ -657,8 +659,7 @@ def wrap_decoder(trg_vocab_size, else: predict = layers.fc(input=dec_output, size=trg_vocab_size, - bias_attr=False, - num_flatten_dims=2) + bias_attr=False) if dec_inputs is None: # Return probs for independent decoder program. predict = layers.softmax(predict) -- GitLab