diff --git a/Classification/cnns/benchmark.sh b/Classification/cnns/benchmark.sh
index b0e68b5d140deeabc8125f64bf06422c43c4f4d3..3671cff87c490c344ae22d0fe2991f2f658d0441 100755
--- a/Classification/cnns/benchmark.sh
+++ b/Classification/cnns/benchmark.sh
@@ -44,4 +44,5 @@ python3 $BENCH_ROOT/of_cnn_train_val.py \
     --fuse_bn_add_relu=True \
     --nccl_fusion_threshold_mb=16 \
     --nccl_fusion_max_ops=24 \
+    --gpu_image_decoder=True \
     --model="resnet50"
diff --git a/Classification/cnns/config.py b/Classification/cnns/config.py
index 1a37f104ec590609669943a4addce2df57abbcc3..215a38597b781e5065a067ed472a1727c1322ca8 100755
--- a/Classification/cnns/config.py
+++ b/Classification/cnns/config.py
@@ -111,7 +111,8 @@ def get_parser(parser=None):
         default=False,
         help='Whether to use use fuse batch normalization add relu. Currently supported in origin/master of OneFlow only.'
     )
-
+    parser.add_argument("--gpu_image_decoder", type=str2bool, default=False,
+                        help='Whether to decode training images with ImageDecoderRandomCropResize.')
     # inference
     parser.add_argument("--image_path", type=str, default='test_img/tiger.jpg', help="image path")
 
diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py
index 3a78f01e011f5ae9815e4bf2c34148f1345794ee..88675c07ab1bb2d323fc166cd7451f92b0142882 100755
--- a/Classification/cnns/job_function_util.py
+++ b/Classification/cnns/job_function_util.py
@@ -35,6 +35,10 @@ def get_train_config(args):
 
     train_config.prune_parallel_cast_ops(True)
     train_config.enable_inplace(True)
+    # Turn the cuDNN heuristic conv-algorithm search on for multi-node runs
+    # and off for single-node runs.
+    multi_node = args.num_nodes > 1
+    train_config.cudnn_conv_heuristic_search_algo(multi_node)
     train_config.enable_fuse_model_update_ops(True)
     return train_config
 
diff --git a/Classification/cnns/ofrecord_util.py b/Classification/cnns/ofrecord_util.py
index 00107eab0d410a534115cd6d74ff3265655bdf32..8c80cf344a778dfb99988d40b1bc69f2a12e391c 100755
--- a/Classification/cnns/ofrecord_util.py
+++ b/Classification/cnns/ofrecord_util.py
@@ -90,14 +90,19 @@ def load_imagenet_for_training(args):
                                         part_name_suffix_length=5,
                                         random_shuffle=True,
                                         shuffle_after_epoch=True)
-    image = flow.data.OFRecordImageDecoderRandomCrop(ofrecord, "encoded",  # seed=seed,
-                                                    color_space=color_space)
     label = flow.data.OFRecordRawDecoder(
         ofrecord, "class/label", shape=(), dtype=flow.int32)
+    if args.gpu_image_decoder:
+        encoded = flow.data.OFRecordBytesDecoder(ofrecord, "encoded")
+        # Size the decoder output from --image_size so both branches yield the same shape.
+        image = flow.data.ImageDecoderRandomCropResize(encoded, target_width=args.image_size, target_height=args.image_size, num_workers=3)
+    else:
+        image = flow.data.OFRecordImageDecoderRandomCrop(ofrecord, "encoded",  # seed=seed,
+                                                        color_space=color_space)
+        image = flow.image.Resize(image, target_size=[args.image_size, args.image_size])[0]
 
-    rsz = flow.image.Resize(image, target_size=[args.image_size, args.image_size]) 
     rng = flow.random.CoinFlip(batch_size=train_batch_size)  # , seed=seed)
-    normal = flow.image.CropMirrorNormalize(rsz[0], mirror_blob=rng,
+    normal = flow.image.CropMirrorNormalize(image, mirror_blob=rng,
                                             color_space=color_space, output_layout=output_layout,
                                             mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float)
     return label, normal
diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7ecfa5cb40556d6352706dc80263a04b0a458c1f
--- /dev/null
+++ b/Classification/cnns/train_fp16.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Train resnet50 with fp16 on one node with 8 GPUs, logging to ../logs.
+# Usage: ./train_fp16.sh [NUM_EPOCH] [DATA_ROOT]
+
+# Remove stale core.* files and any saved snapshots before starting.
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+# Positional arguments; a missing or empty value falls back to the default.
+NUM_EPOCH=${1:-50}
+echo "NUM_EPOCH=$NUM_EPOCH"
+
+# training with imagenet
+DATA_ROOT=${2:-/data/imagenet/ofrecord}
+echo "DATA_ROOT=$DATA_ROOT"
+
+LOG_FOLDER=../logs
+mkdir -p "$LOG_FOLDER"
+LOGFILE="$LOG_FOLDER/resnet_training.log"
+
+export PYTHONUNBUFFERED=1
+echo "PYTHONUNBUFFERED=$PYTHONUNBUFFERED"
+export NCCL_LAUNCH_MODE=PARALLEL
+echo "NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE"
+
+# Expansions are quoted so a DATA_ROOT containing spaces is passed intact.
+python3 of_cnn_train_val.py \
+     --train_data_dir="$DATA_ROOT/train" \
+     --train_data_part_num=256 \
+     --val_data_dir="$DATA_ROOT/validation" \
+     --val_data_part_num=256 \
+     --num_nodes=1 \
+     --gpu_num_per_node=8 \
+     --optimizer="sgd" \
+     --momentum=0.875 \
+     --label_smoothing=0.1 \
+     --learning_rate=1.536 \
+     --loss_print_every_n_iter=100 \
+     --batch_size_per_device=192 \
+     --val_batch_size_per_device=50 \
+     --use_fp16 \
+     --channel_last=True \
+     --pad_output \
+     --fuse_bn_relu=True \
+     --fuse_bn_add_relu=True \
+     --nccl_fusion_threshold_mb=16 \
+     --nccl_fusion_max_ops=24 \
+     --gpu_image_decoder=True \
+     --num_epoch="$NUM_EPOCH" \
+     --model="resnet50" 2>&1 | tee "${LOGFILE}"
+
+echo "Writing log to ${LOGFILE}"
diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6662d2e5c274b9ab90c396870ba60b1e6d8fc11c
--- /dev/null
+++ b/Classification/cnns/train_fp32.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Train resnet50 with fp32 on one node with 8 GPUs, logging to ../logs.
+# Usage: ./train_fp32.sh [NUM_EPOCH] [DATA_ROOT]
+
+# Remove stale core.* files and any saved snapshots before starting.
+rm -rf core.*
+rm -rf ./output/snapshots/*
+
+# Positional arguments; a missing or empty value falls back to the default.
+NUM_EPOCH=${1:-50}
+echo "NUM_EPOCH=$NUM_EPOCH"
+
+# training with imagenet
+DATA_ROOT=${2:-/data/imagenet/ofrecord}
+echo "DATA_ROOT=$DATA_ROOT"
+
+LOG_FOLDER=../logs
+mkdir -p "$LOG_FOLDER"
+LOGFILE="$LOG_FOLDER/resnet_training.log"
+
+export PYTHONUNBUFFERED=1
+echo "PYTHONUNBUFFERED=$PYTHONUNBUFFERED"
+export NCCL_LAUNCH_MODE=PARALLEL
+echo "NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE"
+
+# Expansions are quoted so a DATA_ROOT containing spaces is passed intact.
+python3 of_cnn_train_val.py \
+     --train_data_dir="$DATA_ROOT/train" \
+     --train_data_part_num=256 \
+     --val_data_dir="$DATA_ROOT/validation" \
+     --val_data_part_num=256 \
+     --num_nodes=1 \
+     --gpu_num_per_node=8 \
+     --optimizer="sgd" \
+     --momentum=0.875 \
+     --label_smoothing=0.1 \
+     --learning_rate=0.768 \
+     --loss_print_every_n_iter=100 \
+     --batch_size_per_device=96 \
+     --val_batch_size_per_device=50 \
+     --channel_last=False \
+     --fuse_bn_relu=True \
+     --fuse_bn_add_relu=True \
+     --nccl_fusion_threshold_mb=16 \
+     --nccl_fusion_max_ops=24 \
+     --gpu_image_decoder=True \
+     --num_epoch="$NUM_EPOCH" \
+     --model="resnet50" 2>&1 | tee "${LOGFILE}"
+
+echo "Writing log to ${LOGFILE}"