diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py
index 7f537dbc13c89c404dd8d34ad442c78a09876ee5..c8ff77e0b2f433f0507bbe3ab97dcd0cdeba14e8 100644
--- a/fluid/neural_machine_translation/transformer/model.py
+++ b/fluid/neural_machine_translation/transformer/model.py
@@ -80,7 +80,7 @@ def multi_head_attention(queries,
         # The value 0 in shape attr means copying the corresponding dimension
         # size of the input as the output dimension size.
         reshaped = layers.reshape(
-            x=x, shape=[0, 0, n_head, hidden_size // n_head])
+            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
 
         # permuate the dimensions into:
         # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
@@ -99,7 +99,9 @@ def multi_head_attention(queries,
         # The value 0 in shape attr means copying the corresponding dimension
         # size of the input as the output dimension size.
         return layers.reshape(
-            x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]])
+            x=trans_x,
+            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+            inplace=True)
 
     def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
         """
@@ -523,8 +525,7 @@ def transformer(src_vocab_size,
             epsilon=label_smooth_eps)
 
     cost = layers.softmax_with_cross_entropy(
-        logits=layers.reshape(
-            predict, shape=[-1, trg_vocab_size]),
+        logits=predict,
         label=label,
         soft_label=True if label_smooth_eps else False)
     weighted_cost = cost * weights
@@ -637,6 +638,9 @@ def wrap_decoder(trg_vocab_size,
         preprocess_cmd,
         postprocess_cmd,
         caches=caches)
+    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
+    dec_output = layers.reshape(
+        dec_output, shape=[-1, dec_output.shape[-1]], inplace=True)
     if weight_sharing:
         predict = layers.matmul(
             x=dec_output,
@@ -751,7 +755,6 @@ def fast_decode(
                 dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
                 enc_output=pre_enc_output,
                 caches=pre_caches)
-            logits = layers.reshape(logits, (-1, trg_vocab_size))
 
             topk_scores, topk_indices = layers.topk(
                 input=layers.softmax(logits), k=beam_size)
diff --git a/fluid/neural_machine_translation/transformer/profile.py b/fluid/neural_machine_translation/transformer/profile.py
index a2ac16df9ec0647ef65efc33b9dcdd330f93459c..9a437725cb27c29b0233d6297e84781f5343aff1 100644
--- a/fluid/neural_machine_translation/transformer/profile.py
+++ b/fluid/neural_machine_translation/transformer/profile.py
@@ -1,5 +1,6 @@
 import argparse
 import ast
+import contextlib
 import multiprocessing
 import os
 import six
@@ -79,8 +80,7 @@ def parse_args():
         type=lambda x: str(x.encode().decode("unicode-escape")),
         default=" ",
         help="The delimiter used to split tokens in source or target sentences. "
-        "For EN-DE BPE data we provided, use spaces as token delimiter. "
-        "For EN-FR wordpiece data we provided, use '\x01' as token delimiter.")
+        "For EN-DE BPE data we provided, use spaces as token delimiter.")
     parser.add_argument(
         "--use_mem_opt",
         type=ast.literal_eval,
@@ -98,9 +98,14 @@ def parse_args():
         help="The iteration number to run in profiling.")
     parser.add_argument(
         "--use_parallel_exe",
-        type=bool,
+        type=ast.literal_eval,
         default=False,
         help="The flag indicating whether to use ParallelExecutor.")
+    parser.add_argument(
+        "--profile_ops",
+        type=ast.literal_eval,
+        default=True,
+        help="The flag indicating whether to profile operators.")
     parser.add_argument(
         'opts',
         help='See config.py for all options',
@@ -125,6 +130,8 @@ def parse_args():
 def main(args):
     train_prog = fluid.Program()
     startup_prog = fluid.Program()
+    train_prog.random_seed = 1000
+    startup_prog.random_seed = 1000
     with fluid.program_guard(train_prog, startup_prog):
         with fluid.unique_name.guard():
             sum_cost, avg_cost, predict, token_num, pyreader = transformer(
@@ -243,24 +250,33 @@ def main(args):
                 if args.use_py_reader:
                     pyreader.reset()
                     pyreader.start()
-                break
 
         return reader_time, run_time
 
+    @contextlib.contextmanager
+    def profile_context(profile=True):
+        if profile:
+            with profiler.profiler('All', 'total', '/tmp/profile_file'):
+                yield
+        else:
+            yield
+
     # start-up
     init_flag = True
-    run(1)
+    run(5)
     init_flag = False
 
     # profiling
     start = time.time()
     # currently only support profiling on one device
-    with profiler.profiler('All', 'total', '/tmp/profile_file'):
+    with profile_context(args.profile_ops):
         reader_time, run_time = run(args.iter_num)
     end = time.time()
     total_time = end - start
-    print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
-        total_time, np.sum(reader_time), np.sum(run_time)))
+    print(
+        "Total time: {0}, reader time: {1} s, run time: {2} s, step number: {3}".
+        format(total_time, np.sum(reader_time), np.sum(run_time),
+               args.iter_num))
 
 
 if __name__ == "__main__":
diff --git a/fluid/neural_machine_translation/transformer/reader.py b/fluid/neural_machine_translation/transformer/reader.py
index eb485793584c64f610ede11efda312100584bc9e..10f44ade6768edf9536cad27bcbcd0b08d16e668 100644
--- a/fluid/neural_machine_translation/transformer/reader.py
+++ b/fluid/neural_machine_translation/transformer/reader.py
@@ -297,9 +297,14 @@ class DataReader(object):
         infos = self._sample_infos
 
         if self._sort_type == SortType.POOL:
+            reverse = True
             for i in range(0, len(infos), self._pool_size):
+                # to avoid placing short next to long sentences
+                reverse = not reverse
                 infos[i:i + self._pool_size] = sorted(
                     infos[i:i + self._pool_size],
-                    key=lambda x: x.max_len)
+                    key=lambda x: x.max_len,
+                    reverse=reverse)
 
         # concat batch
         batches = []