diff --git a/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
index 6ebd5c01ff3de389ec5cf9f8aaeb5d0f3f690715..7791c1b3561520a88377e75ea62ba6cb50c4904e 100755
--- a/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
+++ b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
@@ -3,8 +3,9 @@
 export FLAGS_conv_workspace_size_limit=4000 #MB
 export FLAGS_cudnn_exhaustive_search=1
 export FLAGS_cudnn_batchnorm_spatial_persistent=1
+export FLAGS_max_inplace_grad_add=8
 
-DATA_DIR="Your image dataset path, e.g. /work/datasets/ILSVRC2012/"
+DATA_DIR="./data/ILSVRC2012/"
 
 DATA_FORMAT="NHWC"
 USE_FP16=true #whether to use float16
@@ -17,7 +18,7 @@ fi
 python train.py \
        --model=ResNet50 \
-       --data_dir=${DATA_DIR} \
-       --batch_size=256 \
+       --data_dir="${DATA_DIR}" \
+       --batch_size=128 \
        --total_images=1281167 \
        --image_shape 3 224 224 \
        --class_dim=1000 \
@@ -36,5 +37,7 @@ python train.py \
        --reader_thread=10 \
        --reader_buf_size=4000 \
        --use_dali=${USE_DALI} \
+       --fuse_all_optimizer_ops=true \
+       --enable_addto=true \
        --lr=0.1
 
diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index 150b44b6e747efae84b1efb5803be30c4f0528ce..68abad9572ac8d2b9a90c0c5c96278c4ec0c910b 100755
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -244,8 +244,18 @@ def train(args):
         t1 = time.time()
         for batch in train_iter:
             #NOTE: this is for benchmark
             if args.max_iter and total_batch_num == args.max_iter:
+                # Benchmark summary over the most recent (at most 150)
+                # recorded batch times. Dividing by the real sample count
+                # keeps the average correct when fewer were recorded.
+                recent_times = train_batch_time_record[-150:]
+                if recent_times:
+                    avg_times = sum(recent_times) / len(recent_times)
+                    avg_speed = args.batch_size / avg_times
+                    print("=" * 20)
+                    print("total_batch_num: ", total_batch_num, "records_num: ", len(train_batch_time_record))
+                    print("average time: %.5f s/batch, average speed: %.5f imgs/s" % (avg_times, avg_speed))
                 return
             train_batch_metrics = exe.run(compiled_train_prog,
                                           feed=batch,
                                           fetch_list=train_fetch_list)
diff --git a/PaddleCV/image_classification/utils/utility.py b/PaddleCV/image_classification/utils/utility.py
index 45c85102acfc7ebbc45e291ebe633211d69b9f42..e5dafa75a24a9f80583a3b2676de7b819f374fd6 100644
--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -162,6 +162,8 @@ def parse_args():
     add_arg('profiler_path',            str,    './profilier_files',                   "the profiler output file path")
     add_arg('max_iter',                 int,    0,                      "the max train batch num")
     add_arg('same_feed',                int,    0,                      "whether to feed same images")
+    add_arg('enable_addto',             bool,   False,                  "whether to enable the addto strategy for gradient accumulation")
+    add_arg('fuse_all_optimizer_ops',   bool,   False,                  "whether to fuse all optimizer operators")
 
 
     # yapf: enable
@@ -524,6 +526,8 @@ def best_strategy_compiled(args,
         try:
             fluid.require_version(min_version='1.7.0')
             build_strategy.fuse_bn_act_ops = args.fuse_bn_act_ops
+            build_strategy.fuse_all_optimizer_ops = args.fuse_all_optimizer_ops
+            build_strategy.enable_addto = args.enable_addto
         except Exception as e:
             logger.info("PaddlePaddle version 1.7.0 or higher is "
             "required when you want to fuse batch_norm and activation_op.")