diff --git a/BERT/README.md b/BERT/README.md
index 84e05b2df588be9fb42e541f84321203c34009b6..6f122caf113a20fd1e0a5b46f43fa8d7aa280c2e 100644
--- a/BERT/README.md
+++ b/BERT/README.md
@@ -12,7 +12,7 @@
   - 支持 BERT GPU 多卡 Fine-tuning
   - 提供 BERT 预测接口 demo, 方便多硬件设备生产环境的部署

-2)支持FP16/FP32混合精度训练和 Fine-tuning,节省显存开销、加速训练过程;
+2)支持 FP16/FP32 混合精度训练和 Fine-tuning,节省显存开销、加速训练过程;

 3)提供转换成 Paddle Fluid 参数格式的 [BERT 开源预训练模型](https://github.com/google-research/bert) 供下载,以进行下游任务的 Fine-tuning, 包括如下模型:

@@ -155,7 +155,7 @@ python -u run_classifier.py --task_name ${TASK_NAME} \
                    --num_iteration_per_drop_scope 1
 ```

-这里的 `chinese_L-12_H-768_A-12` 即是转换后的中文预训练模型。需要注意的是,BERT 支持按两种方式构建一个 batch 的数据,`in_tokens` 参数影响 `batch_size` 参数的意义,如果 `in_tokens` 为 `true` 则按照 token 个数构建 batch, 如不设定则按照 example 个数来构建 batch. 训练过程中会输出训练误差、训练速度等信息,训练结束后会输出如下所示的在验证集上的测试结果:
+这里的 `chinese_L-12_H-768_A-12` 即是转换后的中文预训练模型。需要注意的是,BERT on PaddlePaddle 支持按两种方式构建一个 batch 的数据,`in_tokens` 参数影响 `batch_size` 参数的意义,如果 `in_tokens` 为 `true` 则按照 token 个数构建 batch, 如不设定则按照 example 个数来构建 batch. 训练过程中会输出训练误差、训练速度等信息,训练结束后会输出如下所示的在验证集上的测试结果:

 ```
 [dev evaluation] ave loss: 0.622958, ave acc: 0.770281, elapsed time: 8.946956 s
diff --git a/BERT/predict_classifier.py b/BERT/predict_classifier.py
index 3b644996bc05e216277dbcde177ecfdc16b10430..fe813ffeffdd2beead9742e66507aac1b3d62db4 100644
--- a/BERT/predict_classifier.py
+++ b/BERT/predict_classifier.py
@@ -109,13 +109,13 @@ def main(args):

     # Due to the design that ParallelExecutor would drop small batches (mostly the last batch)
     # So using ParallelExecutor may left some data unpredicted
-    # if prediction of each and every example is needed, use Executor instead
+    # if prediction of each and every example is needed, please use Executor instead
     predict_exe = fluid.ParallelExecutor(
         use_cuda=args.use_cuda, main_program=predict_prog)

     predict_pyreader.decorate_tensor_provider(
         processor.data_generator(
-            batch_size=args.batch_size, phase='test', epoch=1))
+            batch_size=args.batch_size, phase='test', epoch=1, shuffle=False))

     predict_pyreader.start()
     all_results = []
diff --git a/BERT/run_classifier.py b/BERT/run_classifier.py
index fd5e307eefe0deb837690a7cbd7fc1b65fd35cb0..5ba2ca9224ac0b0710cc078a923c365be5fad847 100644
--- a/BERT/run_classifier.py
+++ b/BERT/run_classifier.py
@@ -65,7 +65,7 @@ data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data process
 data_g.add_arg("data_dir", str, None, "Path to training data.")
 data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
 data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("batch_size", int, 8192, "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
                "Otherwise, it will be the maximum number of examples in one batch.")
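
The README hunk and the new `batch_size` default of 32 in `run_classifier.py` both hinge on how `--in_tokens` changes the meaning of `--batch_size`. The sketch below is a minimal illustration of that distinction only; `make_batches` is a hypothetical helper written for this example and is not the repository's actual data reader.

```python
# Illustrative sketch of the two batching modes controlled by --in_tokens.
# Not the repository's real reader code; it only mirrors the documented semantics.

def make_batches(examples, batch_size, in_tokens=False):
    """Group tokenized examples (lists of token ids) into batches.

    in_tokens=False: batch_size is the number of examples per batch
                     (matches the new default of 32).
    in_tokens=True:  batch_size caps the padded token count per batch,
                     i.e. len(batch) * longest-sequence-in-batch
                     (a token budget, e.g. the old 8192 default).
    """
    batch, max_len = [], 0
    for ex in examples:
        max_len = max(max_len, len(ex))
        if in_tokens:
            would_overflow = (len(batch) + 1) * max_len > batch_size
        else:
            would_overflow = len(batch) + 1 > batch_size
        if would_overflow and batch:
            yield batch
            batch, max_len = [], len(ex)
        batch.append(ex)
    if batch:
        yield batch


if __name__ == "__main__":
    data = [[0] * n for n in (5, 7, 120, 30, 64, 200, 16)]
    # Example-count batching: 3 examples per batch -> batch sizes [3, 3, 1].
    print([len(b) for b in make_batches(data, batch_size=3)])
    # Token-budget batching: at most ~256 padded tokens per batch.
    print([len(b) for b in make_batches(data, batch_size=256, in_tokens=True)])
```

Example-count batching pairs naturally with a small default such as 32, while token-budget batching pairs with a much larger value like the old 8192; since `in_tokens` is off by default, the diff adjusts the default accordingly. The `shuffle=False` added to the prediction-time `data_generator` call is in the same spirit: it presumably keeps the test set in its original order at inference so predictions line up with the input examples.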