Unverified    Commit 05cd6ce3    Authored by: Zhen Wang    Committed by: GitHub

Add new pure fp16 training for ResNet50. (#5047)

Parent: 7a0bff7c
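[Review note] This commit replaces the hand-rolled pure fp16 path (explicit fluid.layers.cast calls, reduce_sum losses, rescale_grad) with Paddle's static-graph AMP API: the forward pass is built under paddle.static.amp.fp16_guard(), the optimizer is wrapped by paddle.static.amp.decorate(..., use_pure_fp16=True, use_fp16_guard=True), and parameters are cast to fp16 by optimizer.amp_init() after initialization. A minimal end-to-end sketch of that pattern, assuming Paddle 2.0's static AMP API (build_resnet50 is a hypothetical model builder standing in for the repo's model code):

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    images = paddle.static.data('image', [None, 3, 224, 224], 'float32')
    labels = paddle.static.data('label', [None, 1], 'int64')

    with paddle.static.amp.fp16_guard():
        # Ops built under the guard are the ones rewritten to fp16 when the
        # decorated optimizer is created with use_fp16_guard=True.
        logits = build_resnet50(images)  # hypothetical model builder

    # Loss ops stay outside the guard and therefore keep running in fp32.
    softmax_out = fluid.layers.softmax(logits, use_cudnn=False)
    cost = fluid.layers.cross_entropy(input=softmax_out, label=labels)
    loss = fluid.layers.mean(cost)

    optimizer = paddle.optimizer.Momentum(
        learning_rate=0.1, momentum=0.9, multi_precision=True)
    optimizer = paddle.static.amp.decorate(
        optimizer,
        init_loss_scaling=128.0,
        use_dynamic_loss_scaling=True,
        use_pure_fp16=True,
        use_fp16_guard=True)
    optimizer.minimize(loss)

    place = paddle.CUDAPlace(0)
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())
    # Cast initialized fp32 parameters to fp16 (fp32 master weights are kept
    # by the decorated optimizer) before the first training step.
    optimizer.amp_init(place, scope=paddle.static.global_scope())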
@@ -37,20 +37,18 @@ def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
 def _basic_model(data, model, args, is_train):
     image = data[0]
     label = data[1]
     if args.model in AMP_MODEL_LIST:
-        image_data = (fluid.layers.cast(image, 'float16')
-                      if args.use_pure_fp16 and not args.use_dali else image)
-        image_transpose = fluid.layers.transpose(
-            image_data,
-            [0, 2, 3, 1]) if args.data_format == 'NHWC' else image_data
-        image_transpose.stop_gradient = image.stop_gradient
-        net_out = model.net(input=image_transpose,
-                            class_dim=args.class_dim,
-                            data_format=args.data_format)
+        with paddle.static.amp.fp16_guard():
+            image_transpose = fluid.layers.transpose(
+                image,
+                [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
+            image_transpose.stop_gradient = image.stop_gradient
+            net_out = model.net(input=image_transpose,
+                                class_dim=args.class_dim,
+                                data_format=args.data_format)
     else:
         net_out = model.net(input=image, class_dim=args.class_dim)
-    if args.use_pure_fp16:
-        net_out = fluid.layers.cast(x=net_out, dtype="float32")
     softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)

     if is_train and args.use_label_smoothing:
@@ -59,12 +57,11 @@ def _basic_model(data, model, args, is_train):
     else:
         cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
-    target_cost = (fluid.layers.reduce_sum(cost)
-                   if args.use_pure_fp16 else fluid.layers.mean(cost))
+    avg_cost = fluid.layers.mean(cost)

     acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
     acc_top5 = fluid.layers.accuracy(
         input=softmax_out, label=label, k=min(5, args.class_dim))
-    return [target_cost, acc_top1, acc_top5]
+    return [avg_cost, acc_top1, acc_top5]


 def _googlenet_model(data, model, args, is_train):
@@ -103,22 +100,18 @@ def _mixup_model(data, model, args, is_train):
     lam = data[3]

     if args.model in AMP_MODEL_LIST:
-        image_data = (fluid.layers.cast(image, 'float16')
-                      if args.use_pure_fp16 and not args.use_dali else image)
-        image_transpose = fluid.layers.transpose(
-            image_data,
-            [0, 2, 3, 1]) if args.data_format == 'NHWC' else image_data
-        image_transpose.stop_gradient = image.stop_gradient
-        net_out = model.net(input=image_transpose,
-                            class_dim=args.class_dim,
-                            data_format=args.data_format)
+        with paddle.static.amp.fp16_guard():
+            image_transpose = fluid.layers.transpose(
+                image,
+                [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
+            image_transpose.stop_gradient = image.stop_gradient
+            net_out = model.net(input=image_transpose,
+                                class_dim=args.class_dim,
+                                data_format=args.data_format)
     else:
         net_out = model.net(input=image, class_dim=args.class_dim)
-    if args.use_pure_fp16:
-        net_out_fp32 = fluid.layers.cast(x=net_out, dtype="float32")
-        softmax_out = fluid.layers.softmax(net_out_fp32, use_cudnn=False)
-    else:
-        softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
+    softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
     if not args.use_label_smoothing:
         loss_a = fluid.layers.cross_entropy(input=softmax_out, label=y_a)
         loss_b = fluid.layers.cross_entropy(input=softmax_out, label=y_b)
@@ -128,17 +121,11 @@ def _mixup_model(data, model, args, is_train):
         loss_b = _calc_label_smoothing_loss(softmax_out, y_b, args.class_dim,
                                             args.label_smoothing_epsilon)

-    if args.use_pure_fp16:
-        target_loss_a = fluid.layers.reduce_sum(x=loss_a)
-        target_loss_b = fluid.layers.reduce_sum(x=loss_b)
-        cost = lam * target_loss_a + (1 - lam) * target_loss_b
-        target_cost = fluid.layers.reduce_sum(x=cost)
-    else:
-        target_loss_a = fluid.layers.mean(x=loss_a)
-        target_loss_b = fluid.layers.mean(x=loss_b)
-        cost = lam * target_loss_a + (1 - lam) * target_loss_b
-        target_cost = fluid.layers.mean(x=cost)
-    return [target_cost]
+    loss_a_mean = fluid.layers.mean(x=loss_a)
+    loss_b_mean = fluid.layers.mean(x=loss_b)
+    cost = lam * loss_a_mean + (1 - lam) * loss_b_mean
+    avg_cost = fluid.layers.mean(x=cost)
+    return [avg_cost]


 def create_model(model, args, is_train):
......
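[Review note] The deleted image casts are subsumed by the guard: with use_fp16_guard=True, only ops created inside fp16_guard() are rewritten to fp16, and the framework inserts the fp32<->fp16 casts at the guard boundary itself. That is also why the explicit cast of net_out back to float32 before softmax could be dropped. A minimal sketch of the guard's scope, assuming the Paddle 2.0 static API (the conv layer is illustrative):

    import paddle
    paddle.enable_static()

    x = paddle.static.data(name='x', shape=[None, 3, 224, 224], dtype='float32')
    with paddle.static.amp.fp16_guard():
        # Rewritten to fp16 by the decorated optimizer; the cast of x is
        # inserted automatically at the guard boundary.
        y = paddle.static.nn.conv2d(x, num_filters=64, filter_size=7)
    # Ops outside the guard (softmax, cross_entropy, accuracy above) keep
    # running in fp32, so losses can use plain fluid.layers.mean again.
    loss = paddle.mean(y)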
@@ -165,7 +165,7 @@ def build(settings, mode='train'):
     min_area = settings.lower_scale
     lower = settings.lower_ratio
     upper = settings.upper_ratio
-    output_dtype = types.FLOAT16 if settings.use_pure_fp16 else types.FLOAT
+    output_dtype = types.FLOAT16 if (settings.use_amp and settings.use_pure_fp16) else types.FLOAT
     interp = settings.interpolation or 1  # default to linear
     interp_map = {
......
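[Review note] The DALI pipeline's output dtype is now gated on both flags, presumably so that setting use_pure_fp16 without use_amp no longer hands half-precision batches to an otherwise fp32 program. Restated as a sketch (types is nvidia.dali.types, settings the parsed args):

    # fp16 pipeline output only when AMP is on AND pure fp16 is requested.
    output_dtype = (types.FLOAT16
                    if settings.use_amp and settings.use_pure_fp16
                    else types.FLOAT)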
@@ -4,13 +4,10 @@ export FLAGS_conv_workspace_size_limit=4000 #MB
 export FLAGS_cudnn_exhaustive_search=1
 export FLAGS_cudnn_batchnorm_spatial_persistent=1

 DATA_DIR="Your image dataset path, e.g. /work/datasets/ILSVRC2012/"

 DATA_FORMAT="NHWC"
 USE_AMP=true #whether to use amp
-USE_PURE_FP16=false
-MULTI_PRECISION=${USE_PURE_FP16}
+USE_PURE_FP16=true
 USE_DALI=true
 USE_ADDTO=true
@@ -34,7 +31,6 @@ python train.py \
     --lr_strategy=piecewise_decay \
     --use_amp=${USE_AMP} \
     --use_pure_fp16=${USE_PURE_FP16} \
-    --multi_precision=${MULTI_PRECISION} \
     --scale_loss=128.0 \
     --use_dynamic_loss_scaling=true \
     --data_format=${DATA_FORMAT} \
@@ -48,6 +44,5 @@ python train.py \
     --reader_thread=10 \
     --reader_buf_size=4000 \
     --use_dali=${USE_DALI} \
-    --lr=0.1 \
-    --random_seed=2020
+    --lr=0.1
@@ -75,6 +75,7 @@ def build_program(is_train, main_prog, startup_prog, args):
             use_se=use_se)
     else:
         model = models.__dict__[args.model]()
+    optimizer = None
     with fluid.program_guard(main_prog, startup_prog):
         if args.random_seed or args.enable_ce:
             main_prog.random_seed = args.random_seed
@@ -91,10 +92,12 @@ def build_program(is_train, main_prog, startup_prog, args):
                 loss_out.append(global_lr)

                 if args.use_amp:
-                    optimizer = fluid.contrib.mixed_precision.decorate(
+                    optimizer = paddle.static.amp.decorate(
                         optimizer,
                         init_loss_scaling=args.scale_loss,
-                        use_dynamic_loss_scaling=args.use_dynamic_loss_scaling)
+                        use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
+                        use_pure_fp16=args.use_pure_fp16,
+                        use_fp16_guard=True)
                 optimizer.minimize(avg_cost)
                 if args.use_ema:
@@ -105,7 +108,7 @@ def build_program(is_train, main_prog, startup_prog, args):
                     ema.update()
                     loss_out.append(ema)
             loss_out.append(data_loader)
-    return loss_out
+    return loss_out, optimizer


 def validate(args,
@@ -178,7 +181,7 @@ def train(args):
     """
     startup_prog = fluid.Program()
     train_prog = fluid.Program()
-    train_out = build_program(
+    train_out, optimizer = build_program(
         is_train=True,
         main_prog=train_prog,
         startup_prog=startup_prog,
@@ -194,7 +197,7 @@ def train(args):
     if args.validate:
         test_prog = fluid.Program()
-        test_out = build_program(
+        test_out, _ = build_program(
             is_train=False,
             main_prog=test_prog,
             startup_prog=startup_prog,
@@ -216,6 +219,12 @@ def train(args):
     #init model by checkpoint or pretrianed model.
     init_model(exe, args, train_prog)

+    if args.use_amp:
+        optimizer.amp_init(place,
+                           scope=paddle.static.global_scope(),
+                           test_program=test_prog if args.validate else None)
+
     num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
     if args.use_dali:
         import dali
......
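[Review note] Ordering matters in the last hunk: amp_init must run after the startup program and init_model have produced or loaded the fp32 parameters, and before the first training step, because it casts those parameters to fp16 in place while the decorated optimizer keeps fp32 master copies for the update. Passing test_program lets it rewrite the eval graph for fp16 as well. A sketch of the sequence this hunk establishes, using the names from the diff:

    exe.run(startup_prog)              # parameters initialized in fp32
    init_model(exe, args, train_prog)  # checkpoint / pretrained fp32 weights
    if args.use_amp:
        # Cast fp32 parameters to fp16; fp32 master weights stay behind.
        optimizer.amp_init(
            place,
            scope=paddle.static.global_scope(),
            test_program=test_prog if args.validate else None)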
@@ -160,9 +160,7 @@ class Optimizer(object):
         self.decay_epochs = args.decay_epochs
         self.decay_rate = args.decay_rate
         self.total_images = args.total_images
-        self.multi_precision = args.multi_precision
-        self.rescale_grad = (1.0 / (args.batch_size / len(fluid.cuda_places()))
-                             if args.use_pure_fp16 else 1.0)
+        self.multi_precision = args.use_pure_fp16

         self.step = int(math.ceil(float(self.total_images) / self.batch_size))
@@ -179,8 +177,7 @@ class Optimizer(object):
             learning_rate=learning_rate,
             momentum=self.momentum_rate,
             regularization=fluid.regularizer.L2Decay(self.l2_decay),
-            multi_precision=self.multi_precision,
-            rescale_grad=self.rescale_grad)
+            multi_precision=self.multi_precision)
         return optimizer

     def cosine_decay(self):
@@ -198,8 +195,7 @@ class Optimizer(object):
             learning_rate=learning_rate,
             momentum=self.momentum_rate,
             regularization=fluid.regularizer.L2Decay(self.l2_decay),
-            multi_precision=self.multi_precision,
-            rescale_grad=self.rescale_grad)
+            multi_precision=self.multi_precision)
         return optimizer

     def cosine_decay_warmup(self):
@@ -218,8 +214,7 @@ class Optimizer(object):
             learning_rate=learning_rate,
             momentum=self.momentum_rate,
             regularization=fluid.regularizer.L2Decay(self.l2_decay),
-            multi_precision=self.multi_precision,
-            rescale_grad=self.rescale_grad)
+            multi_precision=self.multi_precision)
         return optimizer

     def exponential_decay_warmup(self):
@@ -257,8 +252,7 @@ class Optimizer(object):
             learning_rate=learning_rate,
             momentum=self.momentum_rate,
             regularization=fluid.regularizer.L2Decay(self.l2_decay),
-            multi_precision=self.multi_precision,
-            rescale_grad=self.rescale_grad)
+            multi_precision=self.multi_precision)
         return optimizer
@@ -301,8 +295,7 @@ class Optimizer(object):
             learning_rate=self.lr,
             momentum=self.momentum_rate,
             regularization=fluid.regularizer.L2Decay(self.l2_decay),
-            multi_precision=self.multi_precision,
-            rescale_grad=self.rescale_grad)
+            multi_precision=self.multi_precision)
         return optimizer
......
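[Review note] With pure fp16 parameters, multi_precision=True makes Momentum keep an fp32 master copy of every fp16 parameter and apply the momentum/L2-decay update in fp32, avoiding underflow in the weight update; it is now simply tied to use_pure_fp16. The separate rescale_grad correction disappears because losses are mean-reduced again, matching the fp32 path. A minimal sketch with the Paddle 2.0 optimizer API (hyperparameters illustrative):

    import paddle

    optimizer = paddle.optimizer.Momentum(
        learning_rate=0.1,
        momentum=0.9,
        weight_decay=paddle.regularizer.L2Decay(1e-4),
        # Keep fp32 master weights for fp16 parameters; update in fp32.
        multi_precision=True)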
@@ -141,7 +141,6 @@ def parse_args():
     add_arg('validate', bool, True, "whether to validate when training.")
     add_arg('use_amp', bool, False, "Whether to enable mixed precision training with fp16.")
     add_arg('use_pure_fp16', bool, False, "Whether to enable all half precision training with fp16.")
-    add_arg('multi_precision', bool, False, "Whether to enable multi-precision training with fp16.")
     add_arg('scale_loss', float, 1.0, "The value of scale_loss for fp16.")
     add_arg('use_dynamic_loss_scaling', bool, True, "Whether to use dynamic loss scaling.")
     add_arg('data_format', str, "NCHW", "Tensor data format when training.")
@@ -379,13 +378,10 @@ def create_data_loader(is_train, args):
         data_loader and the input data of net,
     """
     image_shape = args.image_shape
-    image_dtype = "float32"
-    if args.model == "ResNet50" and args.use_pure_fp16 and args.use_dali:
-        image_dtype = "float16"
     feed_image = fluid.data(
         name="feed_image",
         shape=[None] + image_shape,
-        dtype=image_dtype,
+        dtype="float32",
         lod_level=0)
     feed_label = fluid.data(
@@ -399,7 +395,7 @@ def create_data_loader(is_train, args):
         feed_y_b = fluid.data(
             name="feed_y_b", shape=[None, 1], dtype="int64", lod_level=0)
         feed_lam = fluid.data(
-            name="feed_lam", shape=[None, 1], dtype=image_dtype, lod_level=0)
+            name="feed_lam", shape=[None, 1], dtype="float32", lod_level=0)

         data_loader = fluid.io.DataLoader.from_generator(
             feed_list=[feed_image, feed_y_a, feed_y_b, feed_lam],
......