提交 a20f4b5f 编写于 作者: S ShawnXuan

Merge branch 'master' of https://github.com/Oneflow-Inc/OneFlow-Benchmark into cnn_pad_output

......@@ -78,8 +78,8 @@ def conv2d_layer(
return output
def alexnet(images, need_transpose=False, channel_last=False, training=True):
data_format = "NHWC" if channel_last else "NCHW"
def alexnet(images, args, need_transpose=False, training=True):
data_format = "NHWC" if args.channel_last else "NCHW"
conv1 = conv2d_layer(
"conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID",
......
......@@ -52,6 +52,7 @@ def get_parser(parser=None):
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, devided by ",", length >= num_nodes')
parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')
parser.add_argument("--model", type=str, default="resnet50",
help="resnet50")
......@@ -92,6 +93,20 @@ def get_parser(parser=None):
parser.add_argument("--batch_size_per_device", type=int, default=64)
parser.add_argument("--val_batch_size_per_device", type=int, default=8)
# fuse bn relu or bn add relu
parser.add_argument(
'--fuse_bn_relu',
type=str2bool,
default=False,
help='Whether to use use fuse batch normalization relu. Currently supported in origin/master of OneFlow only.'
)
parser.add_argument(
'--fuse_bn_add_relu',
type=str2bool,
default=False,
help='Whether to use use fuse batch normalization add relu. Currently supported in origin/master of OneFlow only.'
)
# inference
parser.add_argument("--image_path", type=str, default='test_img/tiger.jpg', help="image path")
......
......@@ -491,12 +491,8 @@ def InceptionE(in_blob, index):
return concat_total
def inceptionv3(images, trainable=True, need_transpose=False, channel_last=False):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
if channel_last:
# if channel_last=True, then change mode from 'nchw' to 'nhwc'
images = flow.transpose(images, name="transpose", perm=[0, 2, 3, 1])
def inceptionv3(images, trainable=True, channel_last=False):
assert channel_last==False, "InceptionV3 does not support channel_last mode, set channel_last=False will be right!"
with flow.scope.namespace("InceptionV3"):
# conv0: 299 x 299 x 3
conv0 = conv2d_layer_with_bn(
......
......@@ -58,7 +58,11 @@ def _relu6(data, prefix):
def mobilenet_unit(data, num_filter=1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), num_group=1, data_format="NCHW", if_act=True, use_bias=False, prefix=''):
conv = flow.layers.conv2d(inputs=data, filters=num_filter, kernel_size=kernel, strides=stride, padding=pad, data_format=data_format, dilation_rate=1, groups=num_group, activation=None, use_bias=use_bias, kernel_initializer=_get_initializer("weight"), bias_initializer=_get_initializer("bias"), kernel_regularizer=_get_regularizer("weight"), bias_regularizer=_get_regularizer("bias"), name=prefix)
conv = flow.layers.conv2d(inputs=data, filters=num_filter, kernel_size=kernel, strides=stride,
padding=pad, data_format=data_format, dilation_rate=1, groups=num_group, activation=None,
use_bias=use_bias, kernel_initializer=_get_initializer("weight"),
bias_initializer=_get_initializer("bias"), kernel_regularizer=_get_regularizer("weight"),
bias_regularizer=_get_regularizer("bias"), name=prefix)
bn = _batch_norm(conv, axis=1, momentum=0.9, epsilon=1e-5, name='%s-BatchNorm'%prefix)
if if_act:
act = _relu6(bn, prefix)
......@@ -156,11 +160,9 @@ class MobileNetV2(object):
else:
self.config_map=MNETV2_CONFIGS_MAP[(224, 224)]
def build_network(self, input_data, need_transpose, data_format, class_num=1000, prefix="", **configs):
def build_network(self, input_data, data_format, class_num=1000, prefix="", **configs):
self.config_map.update(configs)
if need_transpose:
input_data = flow.transpose(input_data, name="transpose", perm=[0, 3, 1, 2])
first_c = int(round(self.config_map['firstconv_filter_num']*self.multiplier))
first_layer = mobilenet_unit(
data=input_data,
......@@ -233,11 +235,13 @@ class MobileNetV2(object):
)
return fc
def __call__(self, input_data, need_transpose, class_num=1000, prefix = "", **configs):
sym = self.build_network(input_data, need_transpose, class_num=class_num, prefix=prefix, **configs)
def __call__(self, input_data, class_num=1000, prefix = "", **configs):
sym = self.build_network(input_data, class_num=class_num, prefix=prefix, **configs)
return sym
def Mobilenet(input_data, trainable=True, need_transpose=False, training=True, data_format="NCHW", num_classes=1000, multiplier=1.0, prefix = ""):
def Mobilenet(input_data, args, trainable=True, training=True, num_classes=1000, multiplier=1.0, prefix = ""):
assert args.channel_last==False, "Mobilenet does not support channel_last mode, set channel_last=False will be right!"
data_format="NHWC" if args.channel_last else "NCHW"
mobilenetgen = MobileNetV2((224,224), multiplier=multiplier)
out = mobilenetgen(input_data, need_transpose, data_format=data_format, class_num=num_classes, prefix = "MobilenetV2")
out = mobilenetgen(input_data, data_format=data_format, class_num=num_classes, prefix = "MobilenetV2")
return out
......@@ -58,8 +58,7 @@ def InferenceNet():
print("Loading data from {}".format(args.val_data_dir))
(labels, images) = ofrecord_util.load_imagenet_for_validation(args)
logits = model_dict[args.model](images,
channel_last=args.channel_last)
logits = model_dict[args.model](images, args)
predictions = flow.nn.softmax(logits)
outputs = {"predictions": predictions, "labels": labels}
return outputs
......
......@@ -57,7 +57,7 @@ def load_image(image_path='test_img/ILSVRC2012_val_00020287.JPEG'):
@flow.global_function("predict", flow.function_config())
def InferenceNet(images: tp.Numpy.Placeholder((1, 3, 224, 224), dtype=flow.float)) -> tp.Numpy:
logits = model_dict[args.model](images, training=False)
logits = model_dict[args.model](images, args, training=False)
predictions = flow.nn.softmax(logits)
return predictions
......
......@@ -71,9 +71,7 @@ def TrainNet():
else:
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
logits = model_dict[args.model](images,
channel_last=args.channel_last
)
logits = model_dict[args.model](images, args)
if args.label_smoothing > 0:
one_hot_labels = label_smoothing(labels, args.num_classes, args.label_smoothing, logits.dtype)
loss = flow.nn.softmax_cross_entropy_with_logits(one_hot_labels, logits, name="softmax_loss")
......@@ -101,8 +99,7 @@ def InferenceNet():
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
logits = model_dict[args.model](
images, channel_last=args.channel_last)
logits = model_dict[args.model](images, args)
predictions = flow.nn.softmax(logits)
outputs = {"predictions": predictions, "labels": labels}
return outputs
......
......@@ -193,9 +193,9 @@ class ResnetBuilder(object):
return pool1
def resnet50(images, trainable=True, training=True, wd=1.0 / 32768, channel_last=False):
weight_regularizer = flow.regularizers.l2(wd) if wd > 0.0 and wd < 1.0 else None
builder = ResnetBuilder(weight_regularizer, trainable, training, channel_last)
def resnet50(images, args, trainable=True, training=True):
weight_regularizer = flow.regularizers.l2(args.wd) if args.wd > 0.0 and args.wd < 1.0 else None
builder = ResnetBuilder(weight_regularizer, trainable, training, args.channel_last, args.fuse_bn_relu, args.fuse_bn_add_relu)
pad_output = 0 # TODO: use args.pad_output
......
......@@ -242,34 +242,34 @@ def resnext34(images, trainable=True, training=True, need_transpose=False,
return model
def resnext50(images, args, trainable=True, training=True, need_transpose=False,
              **kwargs):
    """Constructs a ResNeXt-50 model.

    Builds a ResNeXt with bottleneck blocks in a [3, 4, 6, 3] layer layout;
    channel ordering is taken from args.channel_last.
    """
    net = ResNeXt(images, trainable=trainable, training=training,
                  need_transpose=need_transpose, channel_last=args.channel_last,
                  block=bottle_neck, layers=[3, 4, 6, 3], **kwargs)
    return net.build_network()
def resnext101(images, args, trainable=True, training=True, need_transpose=False,
               **kwargs):
    """Constructs a ResNeXt-101 model.

    Args:
        images: input image blob.
        args: parsed command-line arguments; only args.channel_last is read.
        trainable: whether variables are created as trainable.
        training: whether the network runs in training mode.
        need_transpose: accepted for signature parity with resnext50/resnext152.
            NOTE(review): the original hard-coded need_transpose=False and
            ignored this parameter — behavior preserved here; confirm intent.
        **kwargs: forwarded to the ResNeXt builder.
    """
    resnext_101 = ResNeXt(images, trainable=trainable, training=training,
                          need_transpose=False, channel_last=args.channel_last,
                          block=bottle_neck, layers=[3, 4, 23, 3], **kwargs)
    # fix: original read `resnex_101.build_network()`, an undefined name
    # (NameError at call time); the variable is `resnext_101`.
    model = resnext_101.build_network()
    return model
def resnext152(images, args, trainable=True, training=True, need_transpose=False,
               **kwargs):
    """Constructs a ResNeXt-152 model.

    Same builder as resnext50 but with a deeper [3, 8, 36, 3] layer layout;
    channel ordering is taken from args.channel_last.
    """
    net = ResNeXt(images, trainable=trainable, training=training,
                  need_transpose=need_transpose, channel_last=args.channel_last,
                  block=bottle_neck, layers=[3, 8, 36, 3], **kwargs)
    return net.build_network()
......@@ -25,7 +25,7 @@ import oneflow as flow
def InitNodes(args):
if args.num_nodes > 1:
assert args.num_nodes <= len(args.node_ips)
flow.env.ctrl_port(12138)
flow.env.ctrl_port(args.ctrl_port)
nodes = []
for ip in args.node_ips[:args.num_nodes]:
addr_dict = {}
......
......@@ -104,8 +104,8 @@ def _conv_block(in_blob, index, filters, conv_times, data_format="NCHW"):
return conv_block
def vgg16bn(images, trainable=True, channel_last=False, training=True, wd=1.0/32768):
data_format="NHWC" if channel_last else "NCHW"
def vgg16bn(images, args, trainable=True, training=True):
data_format="NHWC" if args.channel_last else "NCHW"
conv1 = _conv_block(images, 0, 64, 2, data_format)
pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", data_format, name="pool1")
......
......@@ -21,7 +21,10 @@ import glob
from sklearn.metrics import roc_auc_score
import numpy as np
import time
from pynvml import *
def str_list(x):
return x.split(',')
parser = argparse.ArgumentParser()
parser.add_argument('--train_data_dir', type=str, required=True)
parser.add_argument('--train_data_part_num', type=int, required=True)
......@@ -42,7 +45,12 @@ parser.add_argument('--num_wide_sparse_fields', type=int, default=2)
parser.add_argument('--num_deep_sparse_fields', type=int, default=26)
parser.add_argument('--max_iter', type=int, default=30000)
parser.add_argument('--loss_print_every_n_iter', type=int, default=100)
parser.add_argument('--gpu_num', type=int, default=8)
parser.add_argument('--gpu_num_per_node', type=int, default=8)
parser.add_argument('--num_nodes', type=int, default=1,
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, devided by ",", length >= num_nodes')
parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')
parser.add_argument('--hidden_units_num', type=int, default=7)
parser.add_argument('--hidden_size', type=int, default=1024)
......@@ -50,7 +58,6 @@ FLAGS = parser.parse_args()
#DEEP_HIDDEN_UNITS = [1024, 1024]#, 1024, 1024, 1024, 1024, 1024]
DEEP_HIDDEN_UNITS = [FLAGS.hidden_size for i in range(FLAGS.hidden_units_num)]
print(DEEP_HIDDEN_UNITS)
def _data_loader_ofrecord(data_dir, data_part_num, batch_size, part_name_suffix_length=-1,
......@@ -120,6 +127,7 @@ def _model(dense_fields, wide_sparse_fields, deep_sparse_fields):
global_loss = 0.0
def _create_train_callback(step):
handle = nvmlDeviceGetHandleByIndex(0)
def nop(loss):
global global_loss
global_loss += loss.mean()
......@@ -127,8 +135,9 @@ def _create_train_callback(step):
def print_loss(loss):
global global_loss
info = nvmlDeviceGetMemoryInfo(handle)
global_loss += loss.mean()
print(step+1, 'time', datetime.datetime.now(), 'loss', global_loss/FLAGS.loss_print_every_n_iter)
print(step+1, 'time', datetime.datetime.now(), 'loss', global_loss/FLAGS.loss_print_every_n_iter, 'mem', info.used)
global_loss = 0.0
if (step + 1) % FLAGS.loss_print_every_n_iter == 0:
......@@ -139,7 +148,7 @@ def _create_train_callback(step):
def CreateOptimizer(args):
lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate])
return flow.optimizer.LARS(lr_scheduler)
return flow.optimizer.LazyAdam(lr_scheduler)
def _get_train_conf():
......@@ -177,8 +186,37 @@ def eval_job():
predict = flow.math.sigmoid(logits)
return loss, predict, labels
def InitNodes(args):
    """Configure the OneFlow environment for multi-node execution.

    Reads args.num_nodes, args.node_ips and args.ctrl_port; a single-node
    run (num_nodes <= 1) leaves the environment untouched.
    """
    if args.num_nodes <= 1:
        return
    assert args.num_nodes <= len(args.node_ips)
    flow.env.ctrl_port(args.ctrl_port)
    # One {"addr": ip} dict per participating machine, in node_ips order.
    machines = [{"addr": ip} for ip in args.node_ips[:args.num_nodes]]
    flow.env.machine(machines)
def print_args(args):
    """Print a banner and every parsed command-line argument, one per line."""
    heavy_rule = "=" * 66
    print(heavy_rule)
    print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
        'OneFlow-WDL', args.gpu_num_per_node, args.num_nodes))
    print(heavy_rule)
    for name in vars(args):
        print("{} = {}".format(name, getattr(args, name)))
    print("-" * 66)
def main():
flow.config.gpu_device_num(FLAGS.gpu_num)
print_args(FLAGS)
InitNodes(FLAGS)
flow.config.gpu_device_num(FLAGS.gpu_num_per_node)
flow.config.enable_model_io_v2(True)
flow.config.enable_debug_mode(True)
#flow.config.enable_numa_aware_cuda_malloc_host(True)
#flow.config.collective_boxing.enable_fusion(False)
check_point = flow.train.CheckPoint()
......@@ -201,4 +239,6 @@ def main():
if __name__ == '__main__':
    # Initialize NVML so GPU memory usage can be queried during training.
    nvmlInit()
    try:
        main()
    finally:
        # fix: original called nvmlShutdown() unconditionally after main(),
        # so an exception in main() left NVML initialized; always shut down.
        nvmlShutdown()
......@@ -48,6 +48,7 @@ def get_parser(parser=None):
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, devided by ",", length >= num_nodes')
parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')
# train
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
......
......@@ -26,7 +26,7 @@ import oneflow as flow
def InitNodes(args):
if args.num_nodes > 1:
assert args.num_nodes <= len(args.node_ips)
#flow.env.ctrl_port(12138)
flow.env.ctrl_port(args.ctrl_port)
nodes = []
for ip in args.node_ips[:args.num_nodes]:
addr_dict = {}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册