diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py
index a4e588c620f21c4f38eb1906f55d68ddf93214b6..e68ab17e69eff890cb8e6b028ead5e6163213761 100644
--- a/fluid/neural_machine_translation/transformer/config.py
+++ b/fluid/neural_machine_translation/transformer/config.py
@@ -62,10 +62,8 @@ class ModelHyperParams(object):
     eos_idx = 1
     # index for <unk> token
     unk_idx = 2
-    # max length of sequences.
-    # The size of position encoding table should at least plus 1, since the
-    # sinusoid position encoding starts from 1 and 0 can be used as the padding
-    # token for position encoding.
+    # Max length of sequences, which decides the size of the position encoding
+    # table. Positions start from 1, and the start and end tokens are counted in.
     max_length = 256
     # the dimension for word embeddings, which is also the last dimension of
     # the input and output of multi-head attention, position-wise feed-forward
diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py
index 874028081cca218ae16559af9ea9b05d3494c977..505bf0b0062bda27a0299ed7d844e2f05abd95b8 100644
--- a/fluid/neural_machine_translation/transformer/infer.py
+++ b/fluid/neural_machine_translation/transformer/infer.py
@@ -543,7 +543,8 @@ def infer(args, inferencer=fast_infer):
         start_mark=args.special_token[0],
         end_mark=args.special_token[1],
         unk_mark=args.special_token[2],
-        max_length=ModelHyperParams.max_length,
+        # leave room for the start and end tokens
+        max_length=ModelHyperParams.max_length - 2,
         clip_last_batch=False)
     trg_idx2word = test_data.load_dict(
         dict_path=args.trg_vocab_fpath, reverse=True)
diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py
index e3c9b62d068b7cbf0433328d1fcb559a4e659166..cdd7dfed8235a42da867e08e16e0aef4ba500fa1 100644
--- a/fluid/neural_machine_translation/transformer/train.py
+++ b/fluid/neural_machine_translation/transformer/train.py
@@ -290,7 +290,8 @@ def train(args):
         start_mark=args.special_token[0],
         end_mark=args.special_token[1],
         unk_mark=args.special_token[2],
-        max_length=ModelHyperParams.max_length,
+        # leave room for the start and end tokens
+        max_length=ModelHyperParams.max_length - 2,
         clip_last_batch=False)
     train_data = read_multiple(
         reader=train_data.batch_generator,
@@ -326,7 +327,8 @@ def train(args):
         start_mark=args.special_token[0],
         end_mark=args.special_token[1],
         unk_mark=args.special_token[2],
-        max_length=ModelHyperParams.max_length,
+        # leave room for the start and end tokens
+        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False,
        shuffle=False,
        shuffle_batch=False)
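
Note on the `- 2`: the reader's cap applies to raw sequences before the start and end marks are attached, while `ModelHyperParams.max_length` (the position encoding table size) counts those marks in. A minimal standalone sketch of this relationship, assuming the reader adds exactly one start mark and one end mark per sequence (the names below are illustrative, not the repository's actual DataReader API):

    # Hypothetical sketch, not the repository's reader implementation.
    MAX_LENGTH = 256               # ModelHyperParams.max_length (position table size)
    READER_CAP = MAX_LENGTH - 2    # cap passed to the data reader

    def attach_marks(tokens, start_mark="<s>", end_mark="<e>"):
        """Wrap a raw token sequence with the start and end marks."""
        return [start_mark] + list(tokens) + [end_mark]

    raw = ["w"] * READER_CAP           # longest raw sequence the reader keeps
    full = attach_marks(raw)
    assert len(full) == MAX_LENGTH     # exactly fits the position encoding table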