From f610916c5cb46426dc0d15fbff1c51bd7a404441 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Thu, 8 Nov 2018 13:04:23 +0800
Subject: [PATCH] Fix the reshape in no-weight-sharing mode of Transformer

---
 .../neural_machine_translation/transformer/model.py      | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py
index 9fd04cc9..1e510bc6 100644
--- a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py
+++ b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py
@@ -129,10 +129,12 @@ def multi_head_attention(queries,
         # input from the previous time step first.
         k = cache["k"] = layers.concat(
             [layers.reshape(
-                cache["k"], shape=[0, 0, d_model]), k], axis=1)
+                cache["k"], shape=[0, 0, d_key * n_head]), k],
+            axis=1)
         v = cache["v"] = layers.concat(
             [layers.reshape(
-                cache["v"], shape=[0, 0, d_model]), v], axis=1)
+                cache["v"], shape=[0, 0, d_value * n_head]), v],
+            axis=1)
 
     q = __split_heads(q, n_head)
     k = __split_heads(k, n_head)
@@ -657,8 +659,7 @@ def wrap_decoder(trg_vocab_size,
     else:
         predict = layers.fc(input=dec_output,
                             size=trg_vocab_size,
-                            bias_attr=False,
-                            num_flatten_dims=2)
+                            bias_attr=False)
     if dec_inputs is None:
         # Return probs for independent decoder program.
         predict = layers.softmax(predict)
-- 
GitLab