diff --git a/Classification/cnns/benchmark.sh b/Classification/cnns/benchmark.sh index b0e68b5d140deeabc8125f64bf06422c43c4f4d3..3671cff87c490c344ae22d0fe2991f2f658d0441 100755 --- a/Classification/cnns/benchmark.sh +++ b/Classification/cnns/benchmark.sh @@ -44,4 +44,5 @@ python3 $BENCH_ROOT/of_cnn_train_val.py \ --fuse_bn_add_relu=True \ --nccl_fusion_threshold_mb=16 \ --nccl_fusion_max_ops=24 \ + --gpu_image_decoder=True \ --model="resnet50" diff --git a/Classification/cnns/config.py b/Classification/cnns/config.py index 1a37f104ec590609669943a4addce2df57abbcc3..215a38597b781e5065a067ed472a1727c1322ca8 100755 --- a/Classification/cnns/config.py +++ b/Classification/cnns/config.py @@ -111,7 +111,8 @@ def get_parser(parser=None): default=False, help='Whether to use use fuse batch normalization add relu. Currently supported in origin/master of OneFlow only.' ) - + parser.add_argument("--gpu_image_decoder", type=str2bool, + default=False, help='Whether to use ImageDecoderRandomCropResize.') # inference parser.add_argument("--image_path", type=str, default='test_img/tiger.jpg', help="image path") diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py index 3a78f01e011f5ae9815e4bf2c34148f1345794ee..88675c07ab1bb2d323fc166cd7451f92b0142882 100755 --- a/Classification/cnns/job_function_util.py +++ b/Classification/cnns/job_function_util.py @@ -35,6 +35,7 @@ def get_train_config(args): train_config.prune_parallel_cast_ops(True) train_config.enable_inplace(True) + train_config.cudnn_conv_heuristic_search_algo(args.num_nodes > 1) train_config.enable_fuse_model_update_ops(True) return train_config diff --git a/Classification/cnns/ofrecord_util.py b/Classification/cnns/ofrecord_util.py index 00107eab0d410a534115cd6d74ff3265655bdf32..8c80cf344a778dfb99988d40b1bc69f2a12e391c 100755 --- a/Classification/cnns/ofrecord_util.py +++ 
b/Classification/cnns/ofrecord_util.py @@ -90,14 +90,19 @@ def load_imagenet_for_training(args): part_name_suffix_length=5, random_shuffle=True, shuffle_after_epoch=True) - image = flow.data.OFRecordImageDecoderRandomCrop(ofrecord, "encoded", # seed=seed, - color_space=color_space) label = flow.data.OFRecordRawDecoder( ofrecord, "class/label", shape=(), dtype=flow.int32) + if args.gpu_image_decoder: + encoded = flow.data.OFRecordBytesDecoder(ofrecord, "encoded") + image = flow.data.ImageDecoderRandomCropResize(encoded, target_width=args.image_size, target_height=args.image_size, num_workers=3) + else: + image = flow.data.OFRecordImageDecoderRandomCrop(ofrecord, "encoded", # seed=seed, + color_space=color_space) + rsz = flow.image.Resize(image, target_size=[args.image_size, args.image_size]) + image = rsz[0] - rsz = flow.image.Resize(image, target_size=[args.image_size, args.image_size]) rng = flow.random.CoinFlip(batch_size=train_batch_size) # , seed=seed) - normal = flow.image.CropMirrorNormalize(rsz[0], mirror_blob=rng, + normal = flow.image.CropMirrorNormalize(image, mirror_blob=rng, color_space=color_space, output_layout=output_layout, mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float) return label, normal diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh new file mode 100755 index 0000000000000000000000000000000000000000..7ecfa5cb40556d6352706dc80263a04b0a458c1f --- /dev/null +++ b/Classification/cnns/train_fp16.sh @@ -0,0 +1,53 @@ +rm -rf core.* +rm -rf ./output/snapshots/* + +if [ -n "$1" ]; then + NUM_EPOCH=$1 +else + NUM_EPOCH=50 +fi +echo NUM_EPOCH=$NUM_EPOCH + +# training with imagenet +if [ -n "$2" ]; then + DATA_ROOT=$2 +else + DATA_ROOT=/data/imagenet/ofrecord +fi +echo DATA_ROOT=$DATA_ROOT + +LOG_FOLDER=../logs +mkdir -p $LOG_FOLDER +LOGFILE=$LOG_FOLDER/resnet_training.log + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE + +python3 
of_cnn_train_val.py \ + --train_data_dir=$DATA_ROOT/train \ + --train_data_part_num=256 \ + --val_data_dir=$DATA_ROOT/validation \ + --val_data_part_num=256 \ + --num_nodes=1 \ + --gpu_num_per_node=8 \ + --optimizer="sgd" \ + --momentum=0.875 \ + --label_smoothing=0.1 \ + --learning_rate=1.536 \ + --loss_print_every_n_iter=100 \ + --batch_size_per_device=192 \ + --val_batch_size_per_device=50 \ + --use_fp16 \ + --channel_last=True \ + --pad_output \ + --fuse_bn_relu=True \ + --fuse_bn_add_relu=True \ + --nccl_fusion_threshold_mb=16 \ + --nccl_fusion_max_ops=24 \ + --gpu_image_decoder=True \ + --num_epoch=$NUM_EPOCH \ + --model="resnet50" 2>&1 | tee ${LOGFILE} + +echo "Writing log to ${LOGFILE}" diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh new file mode 100755 index 0000000000000000000000000000000000000000..6662d2e5c274b9ab90c396870ba60b1e6d8fc11c --- /dev/null +++ b/Classification/cnns/train_fp32.sh @@ -0,0 +1,51 @@ +rm -rf core.* +rm -rf ./output/snapshots/* + +if [ -n "$1" ]; then + NUM_EPOCH=$1 +else + NUM_EPOCH=50 +fi +echo NUM_EPOCH=$NUM_EPOCH + +# training with imagenet +if [ -n "$2" ]; then + DATA_ROOT=$2 +else + DATA_ROOT=/data/imagenet/ofrecord +fi +echo DATA_ROOT=$DATA_ROOT + +LOG_FOLDER=../logs +mkdir -p $LOG_FOLDER +LOGFILE=$LOG_FOLDER/resnet_training.log + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE + +python3 of_cnn_train_val.py \ + --train_data_dir=$DATA_ROOT/train \ + --train_data_part_num=256 \ + --val_data_dir=$DATA_ROOT/validation \ + --val_data_part_num=256 \ + --num_nodes=1 \ + --gpu_num_per_node=8 \ + --optimizer="sgd" \ + --momentum=0.875 \ + --label_smoothing=0.1 \ + --learning_rate=0.768 \ + --loss_print_every_n_iter=100 \ + --batch_size_per_device=96 \ + --val_batch_size_per_device=50 \ + --channel_last=False \ + --fuse_bn_relu=True \ + --fuse_bn_add_relu=True \ + --nccl_fusion_threshold_mb=16 \ + 
--nccl_fusion_max_ops=24 \ + --gpu_image_decoder=True \ + --num_epoch=$NUM_EPOCH \ + --model="resnet50" 2>&1 | tee ${LOGFILE} + +echo "Writing log to ${LOGFILE}"