diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py
index 9fd04cc907bd8fe14ab88a749727e1dfdc6ebb74..1e510bc620dc56f82e8e7303a56ca44a44b74650 100644
--- a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py
+++ b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py
@@ -129,10 +129,12 @@ def multi_head_attention(queries,
             # input from the previous time step first.
             k = cache["k"] = layers.concat(
                 [layers.reshape(
-                    cache["k"], shape=[0, 0, d_model]), k], axis=1)
+                    cache["k"], shape=[0, 0, d_key * n_head]), k],
+                axis=1)
             v = cache["v"] = layers.concat(
                 [layers.reshape(
-                    cache["v"], shape=[0, 0, d_model]), v], axis=1)
+                    cache["v"], shape=[0, 0, d_value * n_head]), v],
+                axis=1)
 
         q = __split_heads(q, n_head)
         k = __split_heads(k, n_head)
@@ -657,8 +659,7 @@ def wrap_decoder(trg_vocab_size,
     else:
         predict = layers.fc(input=dec_output,
                             size=trg_vocab_size,
-                            bias_attr=False,
-                            num_flatten_dims=2)
+                            bias_attr=False)
         if dec_inputs is None:
             # Return probs for independent decoder program.
             predict = layers.softmax(predict)
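
For context on the first hunk: the cached k/v tensors are outputs of the key/value projections, so their last dimension is d_key * n_head (resp. d_value * n_head), which coincides with d_model only in the special case d_key == d_model // n_head. Below is a minimal, framework-free sketch of that shape arithmetic; the concrete dimension values are illustrative assumptions, not taken from this repo's config.

# Illustrative dimensions (assumed for this sketch; any setting with
# d_key * n_head != d_model exhibits the bug the hunk fixes).
n_head, d_key, d_value, d_model = 8, 96, 96, 512

# multi_head_attention projects the d_model-wide input to n_head * d_key
# features for q/k and n_head * d_value features for v, so the cached k/v
# tensors are [batch, time, d_key * n_head], not [batch, time, d_model].
cached_k_width = d_key * n_head    # 768
cached_v_width = d_value * n_head  # 768

# The old reshape to [0, 0, d_model] only works when d_key == d_model // n_head;
# here it would request an incompatible shape and fail at runtime.
assert cached_k_width != d_model   # 768 != 512
assert cached_v_width != d_model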