提交 a20f4b5f 编写于 作者: S ShawnXuan

Merge branch 'master' of https://github.com/Oneflow-Inc/OneFlow-Benchmark into cnn_pad_output

......@@ -78,8 +78,8 @@ def conv2d_layer(
return output
def alexnet(images, need_transpose=False, channel_last=False, training=True):
data_format = "NHWC" if channel_last else "NCHW"
def alexnet(images, args, need_transpose=False, training=True):
data_format = "NHWC" if args.channel_last else "NCHW"
conv1 = conv2d_layer(
"conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID",
......
......@@ -52,6 +52,7 @@ def get_parser(parser=None):
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, devided by ",", length >= num_nodes')
parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')
parser.add_argument("--model", type=str, default="resnet50",
help="resnet50")
......@@ -92,6 +93,20 @@ def get_parser(parser=None):
parser.add_argument("--batch_size_per_device", type=int, default=64)
parser.add_argument("--val_batch_size_per_device", type=int, default=8)
# fuse bn relu or bn add relu
parser.add_argument(
'--fuse_bn_relu',
type=str2bool,
default=False,
help='Whether to use use fuse batch normalization relu. Currently supported in origin/master of OneFlow only.'
)
parser.add_argument(
'--fuse_bn_add_relu',
type=str2bool,
default=False,
help='Whether to use use fuse batch normalization add relu. Currently supported in origin/master of OneFlow only.'
)
# inference
parser.add_argument("--image_path", type=str, default='test_img/tiger.jpg', help="image path")
......
......@@ -491,12 +491,8 @@ def InceptionE(in_blob, index):
return concat_total
def inceptionv3(images, trainable=True, need_transpose=False, channel_last=False):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
if channel_last:
# if channel_last=True, then change mode from 'nchw' to 'nhwc'
images = flow.transpose(images, name="transpose", perm=[0, 2, 3, 1])
def inceptionv3(images, trainable=True, channel_last=False):
assert channel_last==False, "InceptionV3 does not support channel_last mode, set channel_last=False will be right!"
with flow.scope.namespace("InceptionV3"):
# conv0: 299 x 299 x 3
conv0 = conv2d_layer_with_bn(
......
......@@ -58,7 +58,11 @@ def _relu6(data, prefix):
def mobilenet_unit(data, num_filter=1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), num_group=1, data_format="NCHW", if_act=True, use_bias=False, prefix=''):
conv = flow.layers.conv2d(inputs=data, filters=num_filter, kernel_size=kernel, strides=stride, padding=pad, data_format=data_format, dilation_rate=1, groups=num_group, activation=None, use_bias=use_bias, kernel_initializer=_get_initializer("weight"), bias_initializer=_get_initializer("bias"), kernel_regularizer=_get_regularizer("weight"), bias_regularizer=_get_regularizer("bias"), name=prefix)
conv = flow.layers.conv2d(inputs=data, filters=num_filter, kernel_size=kernel, strides=stride,
padding=pad, data_format=data_format, dilation_rate=1, groups=num_group, activation=None,
use_bias=use_bias, kernel_initializer=_get_initializer("weight"),
bias_initializer=_get_initializer("bias"), kernel_regularizer=_get_regularizer("weight"),
bias_regularizer=_get_regularizer("bias"), name=prefix)
bn = _batch_norm(conv, axis=1, momentum=0.9, epsilon=1e-5, name='%s-BatchNorm'%prefix)
if if_act:
act = _relu6(bn, prefix)
......@@ -156,11 +160,9 @@ class MobileNetV2(object):
else:
self.config_map=MNETV2_CONFIGS_MAP[(224, 224)]
def build_network(self, input_data, need_transpose, data_format, class_num=1000, prefix="", **configs):
def build_network(self, input_data, data_format, class_num=1000, prefix="", **configs):
self.config_map.update(configs)
if need_transpose:
input_data = flow.transpose(input_data, name="transpose", perm=[0, 3, 1, 2])
first_c = int(round(self.config_map['firstconv_filter_num']*self.multiplier))
first_layer = mobilenet_unit(
data=input_data,
......@@ -233,11 +235,13 @@ class MobileNetV2(object):
)
return fc
def __call__(self, input_data, need_transpose, class_num=1000, prefix = "", **configs):
sym = self.build_network(input_data, need_transpose, class_num=class_num, prefix=prefix, **configs)
def __call__(self, input_data, class_num=1000, prefix = "", **configs):
sym = self.build_network(input_data, class_num=class_num, prefix=prefix, **configs)
return sym
def Mobilenet(input_data, trainable=True, need_transpose=False, training=True, data_format="NCHW", num_classes=1000, multiplier=1.0, prefix = ""):
def Mobilenet(input_data, args, trainable=True, training=True, num_classes=1000, multiplier=1.0, prefix = ""):
assert args.channel_last==False, "Mobilenet does not support channel_last mode, set channel_last=False will be right!"
data_format="NHWC" if args.channel_last else "NCHW"
mobilenetgen = MobileNetV2((224,224), multiplier=multiplier)
out = mobilenetgen(input_data, need_transpose, data_format=data_format, class_num=num_classes, prefix = "MobilenetV2")
out = mobilenetgen(input_data, data_format=data_format, class_num=num_classes, prefix = "MobilenetV2")
return out
......@@ -58,8 +58,7 @@ def InferenceNet():
print("Loading data from {}".format(args.val_data_dir))
(labels, images) = ofrecord_util.load_imagenet_for_validation(args)
logits = model_dict[args.model](images,
channel_last=args.channel_last)
logits = model_dict[args.model](images, args)
predictions = flow.nn.softmax(logits)
outputs = {"predictions": predictions, "labels": labels}
return outputs
......
......@@ -57,7 +57,7 @@ def load_image(image_path='test_img/ILSVRC2012_val_00020287.JPEG'):
@flow.global_function("predict", flow.function_config())
def InferenceNet(images: tp.Numpy.Placeholder((1, 3, 224, 224), dtype=flow.float)) -> tp.Numpy:
logits = model_dict[args.model](images, training=False)
logits = model_dict[args.model](images, args, training=False)
predictions = flow.nn.softmax(logits)
return predictions
......
......@@ -71,9 +71,7 @@ def TrainNet():
else:
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
logits = model_dict[args.model](images,
channel_last=args.channel_last
)
logits = model_dict[args.model](images, args)
if args.label_smoothing > 0:
one_hot_labels = label_smoothing(labels, args.num_classes, args.label_smoothing, logits.dtype)
loss = flow.nn.softmax_cross_entropy_with_logits(one_hot_labels, logits, name="softmax_loss")
......@@ -101,8 +99,7 @@ def InferenceNet():
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
logits = model_dict[args.model](
images, channel_last=args.channel_last)
logits = model_dict[args.model](images, args)
predictions = flow.nn.softmax(logits)
outputs = {"predictions": predictions, "labels": labels}
return outputs
......
......@@ -193,9 +193,9 @@ class ResnetBuilder(object):
return pool1
def resnet50(images, trainable=True, training=True, wd=1.0 / 32768, channel_last=False):
weight_regularizer = flow.regularizers.l2(wd) if wd > 0.0 and wd < 1.0 else None
builder = ResnetBuilder(weight_regularizer, trainable, training, channel_last)
def resnet50(images, args, trainable=True, training=True):
weight_regularizer = flow.regularizers.l2(args.wd) if args.wd > 0.0 and args.wd < 1.0 else None
builder = ResnetBuilder(weight_regularizer, trainable, training, args.channel_last, args.fuse_bn_relu, args.fuse_bn_add_relu)
pad_output = 0 # TODO: use args.pad_output
......
......@@ -242,34 +242,34 @@ def resnext34(images, trainable=True, training=True, need_transpose=False,
return model
def resnext50(images, args, trainable=True, training=True, need_transpose=False,
              **kwargs):
    """Constructs a ResNeXt-50 model.

    Builds a ResNeXt with bottleneck blocks in a [3, 4, 6, 3] layer layout;
    channel ordering is taken from args.channel_last.
    """
    net = ResNeXt(images, trainable=trainable, training=training,
                  need_transpose=need_transpose, channel_last=args.channel_last,
                  block=bottle_neck, layers=[3, 4, 6, 3], **kwargs)
    return net.build_network()
def resnext101(images, args, trainable=True, training=True, need_transpose=False,
               **kwargs):
    """Constructs a ResNeXt-101 model.

    Args:
        images: input image blob.
        args: parsed command-line arguments; only args.channel_last is read.
        trainable: whether variables are created as trainable.
        training: whether the network runs in training mode.
        need_transpose: accepted for signature parity with resnext50/resnext152.
            NOTE(review): the original hard-coded need_transpose=False and
            ignored this parameter — behavior preserved here; confirm intent.
        **kwargs: forwarded to the ResNeXt builder.
    """
    resnext_101 = ResNeXt(images, trainable=trainable, training=training,
                          need_transpose=False, channel_last=args.channel_last,
                          block=bottle_neck, layers=[3, 4, 23, 3], **kwargs)
    # fix: original read `resnex_101.build_network()`, an undefined name
    # (NameError at call time); the variable is `resnext_101`.
    model = resnext_101.build_network()
    return model
def resnext152(images, args, trainable=True, training=True, need_transpose=False,
               **kwargs):
    """Constructs a ResNeXt-152 model.

    Same builder as resnext50 but with a deeper [3, 8, 36, 3] layer layout;
    channel ordering is taken from args.channel_last.
    """
    net = ResNeXt(images, trainable=trainable, training=training,
                  need_transpose=need_transpose, channel_last=args.channel_last,
                  block=bottle_neck, layers=[3, 8, 36, 3], **kwargs)
    return net.build_network()
......@@ -25,7 +25,7 @@ import oneflow as flow
def InitNodes(args):
if args.num_nodes > 1:
assert args.num_nodes <= len(args.node_ips)
flow.env.ctrl_port(12138)
flow.env.ctrl_port(args.ctrl_port)
nodes = []
for ip in args.node_ips[:args.num_nodes]:
addr_dict = {}
......
......@@ -104,8 +104,8 @@ def _conv_block(in_blob, index, filters, conv_times, data_format="NCHW"):
return conv_block
def vgg16bn(images, trainable=True, channel_last=False, training=True, wd=1.0/32768):
data_format="NHWC" if channel_last else "NCHW"
def vgg16bn(images, args, trainable=True, training=True):
data_format="NHWC" if args.channel_last else "NCHW"
conv1 = _conv_block(images, 0, 64, 2, data_format)
pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", data_format, name="pool1")
......
......@@ -21,7 +21,10 @@ import glob
from sklearn.metrics import roc_auc_score
import numpy as np
import time
from pynvml import *
def str_list(x):
return x.split(',')
parser = argparse.ArgumentParser()
parser.add_argument('--train_data_dir', type=str, required=True)
parser.add_argument('--train_data_part_num', type=int, required=True)
......@@ -42,7 +45,12 @@ parser.add_argument('--num_wide_sparse_fields', type=int, default=2)
parser.add_argument('--num_deep_sparse_fields', type=int, default=26)
parser.add_argument('--max_iter', type=int, default=30000)
parser.add_argument('--loss_print_every_n_iter', type=int, default=100)
parser.add_argument('--gpu_num', type=int, default=8)
parser.add_argument('--gpu_num_per_node', type=int, default=8)
parser.add_argument('--num_nodes', type=int, default=1,
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, devided by ",", length >= num_nodes')
parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')
parser.add_argument('--hidden_units_num', type=int, default=7)
parser.add_argument('--hidden_size', type=int, default=1024)
......@@ -50,7 +58,6 @@ FLAGS = parser.parse_args()
#DEEP_HIDDEN_UNITS = [1024, 1024]#, 1024, 1024, 1024, 1024, 1024]
DEEP_HIDDEN_UNITS = [FLAGS.hidden_size for i in range(FLAGS.hidden_units_num)]
print(DEEP_HIDDEN_UNITS)
def _data_loader_ofrecord(data_dir, data_part_num, batch_size, part_name_suffix_length=-1,
......@@ -120,6 +127,7 @@ def _model(dense_fields, wide_sparse_fields, deep_sparse_fields):
global_loss = 0.0
def _create_train_callback(step):
handle = nvmlDeviceGetHandleByIndex(0)
def nop(loss):
global global_loss
global_loss += loss.mean()
......@@ -127,8 +135,9 @@ def _create_train_callback(step):
def print_loss(loss):
global global_loss
info = nvmlDeviceGetMemoryInfo(handle)
global_loss += loss.mean()
print(step+1, 'time', datetime.datetime.now(), 'loss', global_loss/FLAGS.loss_print_every_n_iter)
print(step+1, 'time', datetime.datetime.now(), 'loss', global_loss/FLAGS.loss_print_every_n_iter, 'mem', info.used)
global_loss = 0.0
if (step + 1) % FLAGS.loss_print_every_n_iter == 0:
......@@ -139,7 +148,7 @@ def _create_train_callback(step):
def CreateOptimizer(args):
lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate])
return flow.optimizer.LARS(lr_scheduler)
return flow.optimizer.LazyAdam(lr_scheduler)
def _get_train_conf():
......@@ -177,8 +186,37 @@ def eval_job():
predict = flow.math.sigmoid(logits)
return loss, predict, labels
def InitNodes(args):
    """Configure the OneFlow environment for multi-node execution.

    Reads args.num_nodes, args.node_ips and args.ctrl_port; a single-node
    run (num_nodes <= 1) leaves the environment untouched.
    """
    if args.num_nodes <= 1:
        return
    assert args.num_nodes <= len(args.node_ips)
    flow.env.ctrl_port(args.ctrl_port)
    # One {"addr": ip} dict per participating machine, in node_ips order.
    machines = [{"addr": ip} for ip in args.node_ips[:args.num_nodes]]
    flow.env.machine(machines)
def print_args(args):
    """Print a banner and every parsed command-line argument, one per line."""
    heavy_rule = "=" * 66
    print(heavy_rule)
    print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
        'OneFlow-WDL', args.gpu_num_per_node, args.num_nodes))
    print(heavy_rule)
    for name in vars(args):
        print("{} = {}".format(name, getattr(args, name)))
    print("-" * 66)
def main():
flow.config.gpu_device_num(FLAGS.gpu_num)
print_args(FLAGS)
InitNodes(FLAGS)
flow.config.gpu_device_num(FLAGS.gpu_num_per_node)
flow.config.enable_model_io_v2(True)
flow.config.enable_debug_mode(True)
#flow.config.enable_numa_aware_cuda_malloc_host(True)
#flow.config.collective_boxing.enable_fusion(False)
check_point = flow.train.CheckPoint()
......@@ -201,4 +239,6 @@ def main():
if __name__ == '__main__':
    # Initialize NVML so GPU memory usage can be queried during training.
    nvmlInit()
    try:
        main()
    finally:
        # fix: original called nvmlShutdown() unconditionally after main(),
        # so an exception in main() left NVML initialized; always shut down.
        nvmlShutdown()
......@@ -48,6 +48,7 @@ def get_parser(parser=None):
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, devided by ",", length >= num_nodes')
parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')
# train
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
......
......@@ -26,7 +26,7 @@ import oneflow as flow
def InitNodes(args):
if args.num_nodes > 1:
assert args.num_nodes <= len(args.node_ips)
#flow.env.ctrl_port(12138)
flow.env.ctrl_port(args.ctrl_port)
nodes = []
for ip in args.node_ips[:args.num_nodes]:
addr_dict = {}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册