From 05cd6ce3a64ef01c3dc0837d195ccee62dcdfc61 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Mon, 18 Jan 2021 21:22:55 +0800 Subject: [PATCH] Add new pure fp16 training for ResNet50. (#5047) --- PaddleCV/image_classification/build_model.py | 65 ++++++++----------- PaddleCV/image_classification/dali.py | 2 +- .../scripts/train/ResNet50_fp16.sh | 9 +-- PaddleCV/image_classification/train.py | 19 ++++-- .../image_classification/utils/optimizer.py | 19 ++---- .../image_classification/utils/utility.py | 8 +-- 6 files changed, 51 insertions(+), 71 deletions(-) diff --git a/PaddleCV/image_classification/build_model.py b/PaddleCV/image_classification/build_model.py index 6b374b17..65de4a95 100644 --- a/PaddleCV/image_classification/build_model.py +++ b/PaddleCV/image_classification/build_model.py @@ -37,20 +37,18 @@ def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon): def _basic_model(data, model, args, is_train): image = data[0] label = data[1] + if args.model in AMP_MODEL_LIST: - image_data = (fluid.layers.cast(image, 'float16') - if args.use_pure_fp16 and not args.use_dali else image) - image_transpose = fluid.layers.transpose( - image_data, - [0, 2, 3, 1]) if args.data_format == 'NHWC' else image_data - image_transpose.stop_gradient = image.stop_gradient - net_out = model.net(input=image_transpose, - class_dim=args.class_dim, - data_format=args.data_format) + with paddle.static.amp.fp16_guard(): + image_transpose = fluid.layers.transpose( + image, + [0, 2, 3, 1]) if args.data_format == 'NHWC' else image + image_transpose.stop_gradient = image.stop_gradient + net_out = model.net(input=image_transpose, + class_dim=args.class_dim, + data_format=args.data_format) else: net_out = model.net(input=image, class_dim=args.class_dim) - if args.use_pure_fp16: - net_out = fluid.layers.cast(x=net_out, dtype="float32") softmax_out = fluid.layers.softmax(net_out, use_cudnn=False) if is_train and args.use_label_smoothing: @@ -59,12 +57,11 @@ def _basic_model(data, model, args, is_train): else: cost = fluid.layers.cross_entropy(input=softmax_out, label=label) - target_cost = (fluid.layers.reduce_sum(cost) - if args.use_pure_fp16 else fluid.layers.mean(cost)) + avg_cost = fluid.layers.mean(cost) acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1) acc_top5 = fluid.layers.accuracy( input=softmax_out, label=label, k=min(5, args.class_dim)) - return [target_cost, acc_top1, acc_top5] + return [avg_cost, acc_top1, acc_top5] def _googlenet_model(data, model, args, is_train): @@ -103,22 +100,18 @@ def _mixup_model(data, model, args, is_train): lam = data[3] if args.model in AMP_MODEL_LIST: - image_data = (fluid.layers.cast(image, 'float16') - if args.use_pure_fp16 and not args.use_dali else image) - image_transpose = fluid.layers.transpose( - image_data, - [0, 2, 3, 1]) if args.data_format == 'NHWC' else image_data - image_transpose.stop_gradient = image.stop_gradient - net_out = model.net(input=image_transpose, - class_dim=args.class_dim, - data_format=args.data_format) + with paddle.static.amp.fp16_guard(): + image_transpose = fluid.layers.transpose( + image, + [0, 2, 3, 1]) if args.data_format == 'NHWC' else image + image_transpose.stop_gradient = image.stop_gradient + net_out = model.net(input=image_transpose, + class_dim=args.class_dim, + data_format=args.data_format) else: net_out = model.net(input=image, class_dim=args.class_dim) - if args.use_pure_fp16: - net_out_fp32 = fluid.layers.cast(x=net_out, dtype="float32") - softmax_out = fluid.layers.softmax(net_out_fp32, 
use_cudnn=False) - else: - softmax_out = fluid.layers.softmax(net_out, use_cudnn=False) + softmax_out = fluid.layers.softmax(net_out, use_cudnn=False) + if not args.use_label_smoothing: loss_a = fluid.layers.cross_entropy(input=softmax_out, label=y_a) loss_b = fluid.layers.cross_entropy(input=softmax_out, label=y_b) @@ -128,17 +121,11 @@ def _mixup_model(data, model, args, is_train): loss_b = _calc_label_smoothing_loss(softmax_out, y_b, args.class_dim, args.label_smoothing_epsilon) - if args.use_pure_fp16: - target_loss_a = fluid.layers.reduce_sum(x=loss_a) - target_loss_b = fluid.layers.reduce_sum(x=loss_b) - cost = lam * target_loss_a + (1 - lam) * target_loss_b - target_cost = fluid.layers.reduce_sum(x=cost) - else: - target_loss_a = fluid.layers.mean(x=loss_a) - target_loss_b = fluid.layers.mean(x=loss_b) - cost = lam * target_loss_a + (1 - lam) * target_loss_b - target_cost = fluid.layers.mean(x=cost) - return [target_cost] + loss_a_mean = fluid.layers.mean(x=loss_a) + loss_b_mean = fluid.layers.mean(x=loss_b) + cost = lam * loss_a_mean + (1 - lam) * loss_b_mean + avg_cost = fluid.layers.mean(x=cost) + return [avg_cost] def create_model(model, args, is_train): diff --git a/PaddleCV/image_classification/dali.py b/PaddleCV/image_classification/dali.py index 653acd66..11141b2a 100644 --- a/PaddleCV/image_classification/dali.py +++ b/PaddleCV/image_classification/dali.py @@ -165,7 +165,7 @@ def build(settings, mode='train'): min_area = settings.lower_scale lower = settings.lower_ratio upper = settings.upper_ratio - output_dtype = types.FLOAT16 if settings.use_pure_fp16 else types.FLOAT + output_dtype = types.FLOAT16 if (settings.use_amp and settings.use_pure_fp16) else types.FLOAT interp = settings.interpolation or 1 # default to linear interp_map = { diff --git a/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh index 9c1ba24b..2c1f907b 100755 --- a/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh +++ b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh @@ -4,13 +4,10 @@ export FLAGS_conv_workspace_size_limit=4000 #MB export FLAGS_cudnn_exhaustive_search=1 export FLAGS_cudnn_batchnorm_spatial_persistent=1 - DATA_DIR="Your image dataset path, e.g. 
/work/datasets/ILSVRC2012/" - DATA_FORMAT="NHWC" USE_AMP=true #whether to use amp -USE_PURE_FP16=false -MULTI_PRECISION=${USE_PURE_FP16} +USE_PURE_FP16=true USE_DALI=true USE_ADDTO=true @@ -34,7 +31,6 @@ python train.py \ --lr_strategy=piecewise_decay \ --use_amp=${USE_AMP} \ --use_pure_fp16=${USE_PURE_FP16} \ - --multi_precision=${MULTI_PRECISION} \ --scale_loss=128.0 \ --use_dynamic_loss_scaling=true \ --data_format=${DATA_FORMAT} \ @@ -48,6 +44,5 @@ python train.py \ --reader_thread=10 \ --reader_buf_size=4000 \ --use_dali=${USE_DALI} \ - --lr=0.1 \ - --random_seed=2020 + --lr=0.1 diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py index 3749c689..c44ae38c 100755 --- a/PaddleCV/image_classification/train.py +++ b/PaddleCV/image_classification/train.py @@ -75,6 +75,7 @@ def build_program(is_train, main_prog, startup_prog, args): use_se=use_se) else: model = models.__dict__[args.model]() + optimizer = None with fluid.program_guard(main_prog, startup_prog): if args.random_seed or args.enable_ce: main_prog.random_seed = args.random_seed @@ -91,10 +92,12 @@ def build_program(is_train, main_prog, startup_prog, args): loss_out.append(global_lr) if args.use_amp: - optimizer = fluid.contrib.mixed_precision.decorate( + optimizer = paddle.static.amp.decorate( optimizer, init_loss_scaling=args.scale_loss, - use_dynamic_loss_scaling=args.use_dynamic_loss_scaling) + use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, + use_pure_fp16=args.use_pure_fp16, + use_fp16_guard=True) optimizer.minimize(avg_cost) if args.use_ema: @@ -105,7 +108,7 @@ def build_program(is_train, main_prog, startup_prog, args): ema.update() loss_out.append(ema) loss_out.append(data_loader) - return loss_out + return loss_out, optimizer def validate(args, @@ -178,7 +181,7 @@ def train(args): """ startup_prog = fluid.Program() train_prog = fluid.Program() - train_out = build_program( + train_out, optimizer = build_program( is_train=True, main_prog=train_prog, startup_prog=startup_prog, @@ -194,7 +197,7 @@ def train(args): if args.validate: test_prog = fluid.Program() - test_out = build_program( + test_out, _ = build_program( is_train=False, main_prog=test_prog, startup_prog=startup_prog, @@ -216,6 +219,12 @@ def train(args): #init model by checkpoint or pretrianed model. 
init_model(exe, args, train_prog) + + if args.use_amp: + optimizer.amp_init(place, + scope=paddle.static.global_scope(), + test_program=test_prog if args.validate else None) + num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) if args.use_dali: import dali diff --git a/PaddleCV/image_classification/utils/optimizer.py b/PaddleCV/image_classification/utils/optimizer.py index 8a9c1104..68ca94bf 100644 --- a/PaddleCV/image_classification/utils/optimizer.py +++ b/PaddleCV/image_classification/utils/optimizer.py @@ -160,9 +160,7 @@ class Optimizer(object): self.decay_epochs = args.decay_epochs self.decay_rate = args.decay_rate self.total_images = args.total_images - self.multi_precision = args.multi_precision - self.rescale_grad = (1.0 / (args.batch_size / len(fluid.cuda_places())) - if args.use_pure_fp16 else 1.0) + self.multi_precision = args.use_pure_fp16 self.step = int(math.ceil(float(self.total_images) / self.batch_size)) @@ -179,8 +177,7 @@ class Optimizer(object): learning_rate=learning_rate, momentum=self.momentum_rate, regularization=fluid.regularizer.L2Decay(self.l2_decay), - multi_precision=self.multi_precision, - rescale_grad=self.rescale_grad) + multi_precision=self.multi_precision) return optimizer def cosine_decay(self): @@ -198,8 +195,7 @@ class Optimizer(object): learning_rate=learning_rate, momentum=self.momentum_rate, regularization=fluid.regularizer.L2Decay(self.l2_decay), - multi_precision=self.multi_precision, - rescale_grad=self.rescale_grad) + multi_precision=self.multi_precision) return optimizer def cosine_decay_warmup(self): @@ -218,8 +214,7 @@ class Optimizer(object): learning_rate=learning_rate, momentum=self.momentum_rate, regularization=fluid.regularizer.L2Decay(self.l2_decay), - multi_precision=self.multi_precision, - rescale_grad=self.rescale_grad) + multi_precision=self.multi_precision) return optimizer def exponential_decay_warmup(self): @@ -257,8 +252,7 @@ class Optimizer(object): learning_rate=learning_rate, momentum=self.momentum_rate, regularization=fluid.regularizer.L2Decay(self.l2_decay), - multi_precision=self.multi_precision, - rescale_grad=self.rescale_grad) + multi_precision=self.multi_precision) return optimizer @@ -301,8 +295,7 @@ class Optimizer(object): learning_rate=self.lr, momentum=self.momentum_rate, regularization=fluid.regularizer.L2Decay(self.l2_decay), - multi_precision=self.multi_precision, - rescale_grad=self.rescale_grad) + multi_precision=self.multi_precision) return optimizer diff --git a/PaddleCV/image_classification/utils/utility.py b/PaddleCV/image_classification/utils/utility.py index e9162d05..54c3ba96 100644 --- a/PaddleCV/image_classification/utils/utility.py +++ b/PaddleCV/image_classification/utils/utility.py @@ -141,7 +141,6 @@ def parse_args(): add_arg('validate', bool, True, "whether to validate when training.") add_arg('use_amp', bool, False, "Whether to enable mixed precision training with fp16." ) add_arg('use_pure_fp16', bool, False, "Whether to enable all half precision training with fp16." ) - add_arg('multi_precision', bool, False, "Whether to enable multi-precision training with fp16." ) add_arg('scale_loss', float, 1.0, "The value of scale_loss for fp16." 
) add_arg('use_dynamic_loss_scaling', bool, True, "Whether to use dynamic loss scaling.") add_arg('data_format', str, "NCHW", "Tensor data format when training.") @@ -379,13 +378,10 @@ def create_data_loader(is_train, args): data_loader and the input data of net, """ image_shape = args.image_shape - image_dtype = "float32" - if args.model == "ResNet50" and args.use_pure_fp16 and args.use_dali: - image_dtype = "float16" feed_image = fluid.data( name="feed_image", shape=[None] + image_shape, - dtype=image_dtype, + dtype="float32", lod_level=0) feed_label = fluid.data( @@ -399,7 +395,7 @@ def create_data_loader(is_train, args): feed_y_b = fluid.data( name="feed_y_b", shape=[None, 1], dtype="int64", lod_level=0) feed_lam = fluid.data( - name="feed_lam", shape=[None, 1], dtype=image_dtype, lod_level=0) + name="feed_lam", shape=[None, 1], dtype="float32", lod_level=0) data_loader = fluid.io.DataLoader.from_generator( feed_list=[feed_image, feed_y_a, feed_y_b, feed_lam], -- GitLab
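
For reference, below is a minimal, self-contained sketch of how the pieces this patch introduces fit together in Paddle's static-graph mode: the network is built under paddle.static.amp.fp16_guard(), the optimizer is wrapped with paddle.static.amp.decorate(..., use_pure_fp16=True, use_fp16_guard=True), and optimizer.amp_init() is called once after parameter initialization (or after loading a checkpoint, as train.py does after init_model()) to cast the fp32 parameters to fp16. This is not the repository's train.py: the toy network, tensor names, and hyper-parameters are illustrative only, and a CUDA build of PaddlePaddle 2.0+ is assumed.

import numpy as np
import paddle
import paddle.static as static

paddle.enable_static()

main_prog, startup_prog = static.Program(), static.Program()
with static.program_guard(main_prog, startup_prog):
    image = static.data(name='image', shape=[None, 3, 32, 32], dtype='float32')
    label = static.data(name='label', shape=[None, 1], dtype='int64')

    # Ops created inside fp16_guard are the ones cast to fp16 when the
    # decorated optimizer below is built with use_pure_fp16/use_fp16_guard.
    with static.amp.fp16_guard():
        conv = static.nn.conv2d(input=image, num_filters=8, filter_size=3)
        logits = static.nn.fc(conv, size=10)

    # The loss stays in fp32 outside the guard, mirroring build_model.py.
    loss = paddle.mean(
        paddle.nn.functional.softmax_with_cross_entropy(logits, label))

    optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
    optimizer = static.amp.decorate(
        optimizer,
        init_loss_scaling=128.0,           # --scale_loss in ResNet50_fp16.sh
        use_dynamic_loss_scaling=True,
        use_pure_fp16=True,                # --use_pure_fp16=true
        use_fp16_guard=True)
    optimizer.minimize(loss)

place = paddle.CUDAPlace(0)
exe = static.Executor(place)
exe.run(startup_prog)
# After init (or after loading a checkpoint): cast fp32 parameters to fp16.
optimizer.amp_init(place, scope=static.global_scope())

img = np.random.rand(8, 3, 32, 32).astype('float32')
lbl = np.random.randint(0, 10, size=(8, 1)).astype('int64')
loss_val, = exe.run(main_prog,
                    feed={'image': img, 'label': lbl},
                    fetch_list=[loss])
print(float(loss_val))

Two repository details are omitted from the sketch for brevity: utils/optimizer.py also creates its Momentum optimizer with multi_precision=args.use_pure_fp16 so that fp32 master weights are kept, and dali.py emits fp16 images only when both use_amp and use_pure_fp16 are set.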