Unverified commit cc9c0c25, authored by furnace, committed by GitHub

updates data format for model SE_ResNet50_vd (#4943)

* add amp support for model SE_ResNet50_vd

* optimize code, add model ResNet50 and SE_ResNet50_vd to list

* merge develop
Parent 8aac9839
@@ -15,6 +15,9 @@ import paddle
import paddle.fluid as fluid
import utils.utility as utility

AMP_MODEL_LIST = ["ResNet50", "SE_ResNet50_vd"]


def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
    """Calculate label smoothing loss

@@ -34,11 +37,12 @@ def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
def _basic_model(data, model, args, is_train):
    image = data[0]
    label = data[1]

    if args.model in AMP_MODEL_LIST:
        image_data = (fluid.layers.cast(image, 'float16')
                      if args.use_pure_fp16 and not args.use_dali else image)
        image_transpose = fluid.layers.transpose(
            image_data,
            [0, 2, 3, 1]) if args.data_format == 'NHWC' else image_data
        image_transpose.stop_gradient = image.stop_gradient
        net_out = model.net(input=image_transpose,
                            class_dim=args.class_dim,

@@ -55,8 +59,8 @@ def _basic_model(data, model, args, is_train):
    else:
        cost = fluid.layers.cross_entropy(input=softmax_out, label=label)

    target_cost = (fluid.layers.reduce_sum(cost)
                   if args.use_pure_fp16 else fluid.layers.mean(cost))
    acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(
        input=softmax_out, label=label, k=min(5, args.class_dim))

@@ -98,11 +102,12 @@ def _mixup_model(data, model, args, is_train):
    y_b = data[2]
    lam = data[3]

    if args.model in AMP_MODEL_LIST:
        image_data = (fluid.layers.cast(image, 'float16')
                      if args.use_pure_fp16 and not args.use_dali else image)
        image_transpose = fluid.layers.transpose(
            image_data,
            [0, 2, 3, 1]) if args.data_format == 'NHWC' else image_data
        image_transpose.stop_gradient = image.stop_gradient
        net_out = model.net(input=image_transpose,
                            class_dim=args.class_dim,
......
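The gate in the two functions above is what the commit title refers to: any model listed in AMP_MODEL_LIST may cast its input to float16 (pure FP16 mode without DALI) and, when --data_format=NHWC is requested, permute the NCHW batch to channels-last before calling model.net. Below is a minimal NumPy-only sketch of that same input preparation in isolation; the batch variable and its shape are illustrative assumptions, not taken from the repository.

import numpy as np

# Illustrative NCHW batch (N, C, H, W); shape chosen arbitrarily for the sketch.
batch_nchw = np.zeros((32, 3, 224, 224), dtype=np.float32)

# Same permutation as fluid.layers.transpose(image_data, [0, 2, 3, 1]) in the diff.
batch_nhwc = np.transpose(batch_nchw, (0, 2, 3, 1))
print(batch_nhwc.shape)  # (32, 224, 224, 3)

# Pure-FP16 training additionally casts the input first, mirroring
# fluid.layers.cast(image, 'float16') in the diff.
batch_nhwc_fp16 = batch_nhwc.astype(np.float16)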
@@ -21,8 +21,10 @@ import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr

import math

__all__ = [
    "SE_ResNet_vd", "SE_ResNet18_vd", "SE_ResNet34_vd", "SE_ResNet50_vd",
    "SE_ResNet101_vd", "SE_ResNet152_vd", "SE_ResNet200_vd"
]


class SE_ResNet_vd():

@@ -30,7 +32,7 @@ class SE_ResNet_vd():
        self.layers = layers
        self.is_3x3 = is_3x3

    def net(self, input, class_dim=1000, data_format="NCHW"):
        is_3x3 = self.is_3x3
        layers = self.layers
        supported_layers = [18, 34, 50, 101, 152, 200]

@@ -38,7 +40,7 @@ class SE_ResNet_vd():
            "supported layers are {} but input layer is {}".format(supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:

@@ -51,66 +53,94 @@ class SE_ResNet_vd():
        reduction_ratio = 16
        if is_3x3 == False:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu',
                data_format=data_format)
        else:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=32,
                filter_size=3,
                stride=2,
                act='relu',
                name='conv1_1',
                data_format=data_format)
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=32,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_2',
                data_format=data_format)
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=64,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_3',
                data_format=data_format)

        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max',
            data_format=data_format)

        if layers >= 50:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    if layers in [101, 152, 200] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    conv = self.bottleneck_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        if_first=block == i == 0,
                        reduction_ratio=reduction_ratio,
                        name=conv_name,
                        data_format=data_format)
        else:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    conv = self.basic_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        if_first=block == i == 0,
                        reduction_ratio=reduction_ratio,
                        name=conv_name,
                        data_format=data_format)

        pool = fluid.layers.pool2d(
            input=conv,
            pool_size=7,
            pool_type='avg',
            global_pooling=True,
            data_format=data_format)
        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
        out = fluid.layers.fc(
            input=pool,
            size=class_dim,
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Uniform(-stdv, stdv),
                name='fc6_weights'),
            bias_attr=ParamAttr(name='fc6_offset'))

        return out

    def conv_bn_layer(self,
                      input,

@@ -119,7 +149,8 @@ class SE_ResNet_vd():
                      stride=1,
                      groups=1,
                      act=None,
                      name=None,
                      data_format='NCHW'):
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,

@@ -129,34 +160,39 @@ class SE_ResNet_vd():
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False,
            data_format=data_format)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance',
            data_layout=data_format)

    def conv_bn_layer_new(self,
                          input,
                          num_filters,
                          filter_size,
                          stride=1,
                          groups=1,
                          act=None,
                          name=None,
                          data_format='NCHW'):
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='avg',
            ceil_mode=True,
            data_format=data_format)

        conv = fluid.layers.conv2d(
            input=pool,
            num_filters=num_filters,

@@ -166,130 +202,198 @@ class SE_ResNet_vd():
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False,
            data_format=data_format)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance',
            data_layout=data_format)

    def shortcut(self,
                 input,
                 ch_out,
                 stride,
                 name,
                 if_first=False,
                 data_format='NCHW'):
        if data_format == 'NCHW':
            ch_in = input.shape[1]
        else:
            ch_in = input.shape[-1]
        if ch_in != ch_out or stride != 1:
            if if_first:
                return self.conv_bn_layer(
                    input,
                    ch_out,
                    1,
                    stride,
                    name=name,
                    data_format=data_format)
            else:
                return self.conv_bn_layer_new(
                    input,
                    ch_out,
                    1,
                    stride,
                    name=name,
                    data_format=data_format)
        elif if_first:
            return self.conv_bn_layer(
                input, ch_out, 1, stride, name=name, data_format=data_format)
        else:
            return input

    def bottleneck_block(self, input, num_filters, stride, name, if_first,
                         reduction_ratio, data_format):
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a",
            data_format=data_format)
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b",
            data_format=data_format)
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c",
            data_format=data_format)
        scale = self.squeeze_excitation(
            input=conv2,
            num_channels=num_filters * 4,
            reduction_ratio=reduction_ratio,
            name='fc_' + name,
            data_format=data_format)

        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            if_first=if_first,
            name=name + "_branch1",
            data_format=data_format)

        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')

    def basic_block(self, input, num_filters, stride, name, if_first,
                    reduction_ratio, data_format):
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a",
            data_format=data_format)
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b",
            data_format=data_format)
        scale = self.squeeze_excitation(
            input=conv1,
            num_channels=num_filters,
            reduction_ratio=reduction_ratio,
            name='fc_' + name,
            data_format=data_format)
        short = self.shortcut(
            input,
            num_filters,
            stride,
            if_first=if_first,
            name=name + "_branch1",
            data_format=data_format)
        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')

    def squeeze_excitation(self,
                           input,
                           num_channels,
                           reduction_ratio,
                           name=None,
                           data_format='NCHW'):
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=0,
            pool_type='avg',
            global_pooling=True,
            data_format=data_format)
        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
        squeeze = fluid.layers.fc(
            input=pool,
            size=num_channels // reduction_ratio,
            act='relu',
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Uniform(-stdv, stdv),
                name=name + '_sqz_weights'),
            bias_attr=ParamAttr(name=name + '_sqz_offset'))
        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
        excitation = fluid.layers.fc(
            input=squeeze,
            size=num_channels,
            act='sigmoid',
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Uniform(-stdv, stdv),
                name=name + '_exc_weights'),
            bias_attr=ParamAttr(name=name + '_exc_offset'))
        # scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
        # return scale
        input_in = fluid.layers.transpose(
            input, [0, 3, 1, 2]) if data_format == 'NHWC' else input
        input_in.stop_gradient = input.stop_gradient
        scale = fluid.layers.elementwise_mul(x=input_in, y=excitation, axis=0)
        scale_out = fluid.layers.transpose(
            scale, [0, 2, 3, 1]) if data_format == 'NHWC' else scale
        scale_out.stop_gradient = scale.stop_gradient
        return scale_out


def SE_ResNet18_vd():
    model = SE_ResNet_vd(layers=18, is_3x3=True)
    return model


def SE_ResNet34_vd():
    model = SE_ResNet_vd(layers=34, is_3x3=True)
    return model


def SE_ResNet50_vd():
    model = SE_ResNet_vd(layers=50, is_3x3=True)
    return model


def SE_ResNet101_vd():
    model = SE_ResNet_vd(layers=101, is_3x3=True)
    return model


def SE_ResNet152_vd():
    model = SE_ResNet_vd(layers=152, is_3x3=True)
    return model


def SE_ResNet200_vd():
    model = SE_ResNet_vd(layers=200, is_3x3=True)
    return model
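One detail worth calling out in the squeeze_excitation method above: fluid.layers.elementwise_mul(x, y, axis=0) aligns the [N, C] excitation vector with x starting at dimension 0, which assumes the channel dimension of x is dimension 1. That only holds for NCHW, so the NHWC path transposes the feature map back to NCHW for the multiply and restores NHWC afterwards. The following NumPy sketch shows the same broadcast logic; the shapes are illustrative assumptions, not taken from the repository.

import numpy as np

# Feature map in channels-last layout and a per-sample, per-channel SE gate.
feat_nhwc = np.random.rand(2, 7, 7, 64).astype(np.float32)  # (N, H, W, C)
gate = np.random.rand(2, 64).astype(np.float32)              # (N, C)

# Transpose to NCHW so the (N, C) gate broadcasts over H and W,
# mirroring the transpose + elementwise_mul(axis=0) pair in the diff.
feat_nchw = np.transpose(feat_nhwc, (0, 3, 1, 2))             # (N, C, H, W)
scaled_nchw = feat_nchw * gate[:, :, None, None]
scaled_nhwc = np.transpose(scaled_nchw, (0, 2, 3, 1))         # back to (N, H, W, C)
print(scaled_nhwc.shape)  # (2, 7, 7, 64)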
#SE_ResNet50_vd
export CUDA_VISIBLE_DEVICES=4
export FLAGS_conv_workspace_size_limit=4000  # MB, cuDNN convolution workspace limit
export FLAGS_cudnn_exhaustive_search=1       # exhaustive cuDNN algorithm search
export FLAGS_cudnn_batchnorm_spatial_persistent=1  # persistent spatial batch-norm kernels
DATA_DIR="Your image dataset path, e.g. /work/datasets/ILSVRC2012/"
DATA_FORMAT="NHWC"
USE_FP16=true   # whether to use float16
USE_DALI=true   # whether to use NVIDIA DALI for data loading
USE_ADDTO=true  # whether to enable the inplace addto strategy
if ${USE_ADDTO}; then
export FLAGS_max_inplace_grad_add=8
fi
if ${USE_DALI}; then
export FLAGS_fraction_of_gpu_memory_to_use=0.8
fi
python train.py \
--model=SE_ResNet50_vd \
--data_dir=${DATA_DIR} \
--batch_size=128 \
--lr_strategy=cosine_decay \
--use_fp16=${USE_FP16} \
--data_format=${DATA_FORMAT} \
--lr=0.1 \
--num_epochs=200 \
--model_save_dir=output/ \
--l2_decay=1e-4 \
--use_mixup=False \
--use_label_smoothing=True \
--label_smoothing_epsilon=0.1 \
--enable_addto=${USE_ADDTO} \
--use_dali=${USE_DALI} \
--image_shape 4 224 224 \
--fuse_bn_act_ops=true \
--fuse_bn_add_act_ops=true \
--fuse_elewise_add_act_ops=true \
@@ -268,6 +268,7 @@ def train(args):
            #NOTE: this is for benchmark
            if args.max_iter and total_batch_num == args.max_iter:
                return

            reader_cost_averager.record(time.time() - batch_start)

            train_batch_metrics = exe.run(compiled_train_prog,
......