diff --git a/README.md b/README.md index b0736556e23efbf43e341a1e7e3b2ab8030b001b..90d4cc1d4b6ff6722078a6a192183d92d2b2f33c 100755 --- a/README.md +++ b/README.md @@ -5,35 +5,41 @@ OneFlow models for benchmarking. ### cnns * 1 node, 1 gpu: ``` - python3 cnn_benchmark/of_cnn_benchmarks.py \ - --gpu_num_per_node=1 \ - --model="vgg16" \ - --batch_size_per_device=8 \ - --iter_num=5 \ - --learning_rate=0.01 \ - --optimizer="sgd" \ - --loss_print_every_n_iter=1 \ - --warmup_iter_num=2 \ - --data_dir="/dataset/ofrecord/imagenet/train" + python3 cnn_benchmark/of_cnn_train_val.py \ + --gpu_num_per_node=1 \ + --batch_size_per_device=32 \ + --val_batch_size_per_device=32 \ + --use_new_dataloader=True \ + --train_data_part_num=256 \ + --val_data_part_num=256 \ + --num_epochs=1 \ + --optimizer="sgd" \ + --learning_rate=0.256 \ + --use_fp16=False \ + --use_boxing_v2=True \ + --model="resnet50" ``` -* 2 nodes, 2 gpu each node: +* 2 nodes: - simply add `--node_num=2 --node_list="192.168.1.12,192.168.1.14" ` : + simply add `--num_nodes=2 --node_ips="192.168.1.12,192.168.1.14" ` : ``` - python3 cnn_benchmark/of_cnn_benchmarks.py \ - --gpu_num_per_node=2 \ - --node_num=2 \ - --node_list="192.168.1.12,192.168.1.14" \ - --model="vgg16" \ - --batch_size_per_device=8 \ - --iter_num=5 \ - --learning_rate=0.01 \ - --optimizer="sgd" \ - --loss_print_every_n_iter=1 \ - --warmup_iter_num=2 \ - --data_dir="/dataset/ofrecord/imagenet/train" + python3 cnn_benchmark/of_cnn_train_val.py \ + --num_nodes=2 \ + --node_ips="192.168.1.12,192.168.1.14" \ + --gpu_num_per_node=1 \ + --batch_size_per_device=32 \ + --val_batch_size_per_device=32 \ + --use_new_dataloader=True \ + --train_data_part_num=256 \ + --val_data_part_num=256 \ + --num_epochs=1 \ + --optimizer="sgd" \ + --learning_rate=0.256 \ + --use_fp16=False \ + --use_boxing_v2=True \ + --model="resnet50" ``` ### bert pretrain @@ -42,15 +48,10 @@ OneFlow models for benchmarking. ``` python3 bert_benchmark/run_pretraining.py \ --gpu_num_per_node=1 \ - --node_num=1 \ --learning_rate=1e-4 \ - --weight_l2=0.01 \ - --batch_size_per_device=24 \ + --batch_size_per_device=12 \ --iter_num=5 \ --loss_print_every_n_iter=1 \ - --warmup_iter_num=2 \ - --data_dir="/dataset/ofrecord/wiki_128" \ - --data_part_num=1 \ --seq_length=128 \ --max_predictions_per_seq=20 \ --num_hidden_layers=12 \ @@ -60,22 +61,23 @@ OneFlow models for benchmarking. --vocab_size=30522 \ --attention_probs_dropout_prob=0.1 \ --hidden_dropout_prob=0.1 \ - --hidden_size_per_head=64 + --hidden_size_per_head=64 \ + --data_part_num=1 \ + --data_dir="/dataset/bert/of_wiki_seq_len_128" ``` * bert large: + bert large's is different from bert base in flowing configurations: + `--max_predictions_per_seq=80 --num_hidden_layers=24 --num_attention_heads=16 --max_position_embeddings=512` + ``` - python3 ../bert_benchmark/run_pretraining.py \ - --gpu_num_per_node=$GPU_NUM_PER_NODE \ - --node_num=$NODE_NUM \ - --node_list=$NODE_LIST \ + python3 bert_benchmark/run_pretraining.py \ + --gpu_num_per_node=1 \ --learning_rate=1e-4 \ - --weight_l2=0.01 \ - --batch_size_per_device=24 \ + --batch_size_per_device=12 \ --iter_num=5 \ --loss_print_every_n_iter=1 \ - --warmup_iter_num=2 \ - --seq_length=512 \ + --seq_length=128 \ --max_predictions_per_seq=80 \ --num_hidden_layers=24 \ --num_attention_heads=16 \ @@ -85,27 +87,13 @@ OneFlow models for benchmarking. --attention_probs_dropout_prob=0.1 \ --hidden_dropout_prob=0.1 \ --hidden_size_per_head=64 \ - --data_part_num=32 \ - --data_dir=/dataset/ofrecord/wiki_512" + --data_part_num=1 \ + --data_dir="/dataset/bert/of_wiki_seq_len_128" ``` -* 2 nodes, 2 gpu each node: +* 2 nodes: - simply add `--node_num=2 --node_list='192.168.1.12,192.168.1.14' `,see above. - -## Inference -* 1 node, 1 gpu: - ``` - python3 cnn_benchmark/of_cnn_infer_benchmarks.py \ - --gpu_num_per_node=1 \ - --model="vgg16" \ - --batch_size_per_device=8 \ - --iter_num=5 \ - --print_every_n_iter=1 \ - --warmup_iter_num=2 \ - --data_dir="/dataset/ofrecord/imagenet/train" - ``` - If you want to run the benchmark with TensorRT or XLA, only pass `--use_tensorrt` or `--use_xla_jit` to enable it. Low-precision arithmetics such as float16 or int8 are usually faster than 32bit float, and you can pass `--precision=float16` for acceleration. + simply add `--num_nodes=2 --node_ips="192.168.1.12,192.168.1.14" ` : ## build docker images from wheel diff --git a/cnn_benchmark/__init__.py b/cnn_benchmark/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/cnn_benchmark/alexnet.py b/cnn_benchmark/alexnet.py deleted file mode 100755 index e1b015fce18aed307348c8d2845eab65483c712e..0000000000000000000000000000000000000000 --- a/cnn_benchmark/alexnet.py +++ /dev/null @@ -1,245 +0,0 @@ -# ################################################################### -# alexnet.py -# 使用方法说明: -# 单机运行: python alexnet.py -g 1 -# -g 指定使用的GPU个数 -# 多机运行: python alexnet.py -g 8 -m -n "192.168.1.15,192.168.1.16" -# -g 指定使用的GPU个数 -# -m 指定使用多机运行 -# -n 指定各个机器ip地址,用逗号分格 -# ################################################################### - -import oneflow as flow -import argparse - -DATA_DIR = "/dataset/imagenet_1k/oneflow/30/train" -parser = argparse.ArgumentParser(description="flags for multi-node and resource") -parser.add_argument("-i", "--iter_num", type=int, default=10, required=False) -parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False) -parser.add_argument( - "-m", "--multinode", default=False, action="store_true", required=False -) -parser.add_argument("-n", "--node_list", type=str, default=None, required=False) -parser.add_argument("-e", "--eval_dir", type=str, default=DATA_DIR, required=False) -parser.add_argument("-t", "--train_dir", type=str, default=DATA_DIR, required=False) -parser.add_argument("-load", "--model_load_dir", type=str, default="", required=False) -parser.add_argument( - "-save", "--model_save_dir", type=str, default="./checkpoints", required=False -) -args = parser.parse_args() - - -def _data_load_layer(data_dir): - # 从数据集加载图像,并进行数据预处理 - image_blob_conf = flow.data.BlobConf( - "encoded", - shape=(227, 227, 3), - dtype=flow.float, - codec=flow.data.ImageCodec([flow.data.ImageResizePreprocessor(227, 227)]), - preprocessors=[flow.data.NormByChannelPreprocessor((123.68, 116.78, 103.94))], - ) - - # 从数据集加载标签 - label_blob_conf = flow.data.BlobConf( - "class/label", shape=(), dtype=flow.int32, codec=flow.data.RawCodec() - ) - - # 解码 - labels, images = flow.data.decode_ofrecord( - data_dir, - (label_blob_conf, image_blob_conf), - batch_size=12, - data_part_num=8, - name="decode", - ) - - return labels, images - - -def _conv2d_layer( - name, - input, - filters, - kernel_size=3, - strides=1, - padding="SAME", - data_format="NCHW", - dilation_rate=1, - activation="Relu", - use_bias=False, - weight_initializer=flow.random_uniform_initializer(), - bias_initializer=None, -): - weight_shape = (filters, input.static_shape[1], kernel_size, kernel_size) - weight = flow.get_variable( - name + "-weight", - shape=weight_shape, - dtype=input.dtype, - initializer=weight_initializer, - ) - output = flow.nn.conv2d( - input, weight, strides, padding, data_format, dilation_rate, name=name - ) - if use_bias: - bias = flow.get_variable( - name + "-bias", - shape=(filters,), - dtype=input.dtype, - initializer=bias_initializer, - ) - output = flow.nn.bias_add(output, bias, data_format) - - if activation is not None: - if activation == "Relu": - output = flow.keras.activations.relu(output) - else: - raise NotImplementedError - - return output - - -def alexnet(images, labels): - # 数据数据集格式转换, NHWC -> NCHW - transposed = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) - - conv1 = _conv2d_layer( - "conv1", transposed, filters=64, kernel_size=11, strides=4, padding="VALID" - ) - - pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1") - - conv2 = _conv2d_layer("conv2", pool1, filters=192, kernel_size=5) - - pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2") - - conv3 = _conv2d_layer("conv3", pool2, filters=384) - - conv4 = _conv2d_layer("conv4", conv3, filters=384) - - conv5 = _conv2d_layer("conv5", conv4, filters=256) - - pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5") - - if len(pool5.shape) > 2: - pool5 = flow.reshape(pool5, shape=(pool5.static_shape[0], -1)) - - fc1 = flow.layers.dense( - inputs=pool5, - units=4096, - activation=flow.keras.activations.relu, - use_bias=False, - kernel_initializer=flow.random_uniform_initializer(), - bias_initializer=False, - trainable=True, - name="fc1", - ) - - dropout1 = flow.nn.dropout(fc1, rate=0.5) - - fc2 = flow.layers.dense( - inputs=dropout1, - units=4096, - activation=flow.keras.activations.relu, - use_bias=False, - kernel_initializer=flow.random_uniform_initializer(), - bias_initializer=False, - trainable=True, - name="fc2", - ) - - dropout2 = flow.nn.dropout(fc2, rate=0.5) - - fc3 = flow.layers.dense( - inputs=dropout2, - units=1001, - activation=None, - use_bias=False, - kernel_initializer=flow.random_uniform_initializer(), - bias_initializer=False, - trainable=True, - name="fc3", - ) - - # 损失函数 - loss = flow.nn.sparse_softmax_cross_entropy_with_logits( - labels, fc3, name="softmax_loss" - ) - - return loss - - -# 训练任务 -@flow.function -def alexnet_train_job(): - # 设置训练超参数 - flow.config.train.primary_lr(0.00001) - flow.config.train.model_update_conf(dict(naive_conf={})) - - # 加载数据 - (labels, images) = _data_load_layer(args.train_dir) - - # 构建网络 - loss = alexnet(images, labels) - - # 指定训练网络的loss(优化目标) - flow.losses.add_loss(loss) - - return loss - - -# 预测任务 -@flow.function -def alexnet_eval_job(): - # 加载数据 - (labels, images) = _data_load_layer(args.eval_dir) - - # 构建预测网络 - loss = alexnet(images, labels) - - return loss - - -def main(): - # 配置运行方式 - flow.config.gpu_device_num(args.gpu_num_per_node) - flow.config.ctrl_port(9788) - flow.config.default_data_type(flow.float) - - # 设置多机分布式端口 - if args.multinode: - flow.config.ctrl_port(12138) - nodes = [] - for n in args.node_list.strip().split(","): - addr_dict = {} - addr_dict["addr"] = n - nodes.append(addr_dict) - flow.config.machine(nodes) - - # 模型加载/初始化 - check_point = flow.train.CheckPoint() - if not args.model_load_dir: - check_point.init() - else: - check_point.load(args.model_load_dir) - - # 训练迭代过程 - print("{:>12} {:>12} {:>12}".format("iter", "loss type", "loss value")) - for i in range(args.iter_num): - fmt_str = "{:>12} {:>12} {:>12.10f}" - - # 打印训练输出 - train_loss = alexnet_train_job().get().mean() - print(fmt_str.format(i, "train loss:", train_loss)) - - # 打印预测输出 - if (i + 1) % 10 == 0: - eval_loss = alexnet_eval_job().get().mean() - print(fmt_str.format(i, "eval loss:", eval_loss)) - - # 保存模型 - if (i + 1) % 100 == 0: - check_point.save(args.model_save_dir + str(i)) - - -if __name__ == "__main__": - main() diff --git a/cnn_benchmark/alexnet_model.py b/cnn_benchmark/alexnet_model.py index 1a87b8b6444cfc8b8c64034a41a48445087c297f..9a559540964a8c158bebb419652baccd74b7aad6 100755 --- a/cnn_benchmark/alexnet_model.py +++ b/cnn_benchmark/alexnet_model.py @@ -6,12 +6,13 @@ import oneflow as flow from model_util import conv2d_layer -def alexnet(images, trainable=True): +def alexnet(images, need_transpose=False): - transposed = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) + if need_transpose: + images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) conv1 = conv2d_layer( - "conv1", transposed, filters=64, kernel_size=11, strides=4, padding="VALID" + "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID" ) pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1") @@ -29,7 +30,7 @@ def alexnet(images, trainable=True): pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5") if len(pool5.shape) > 2: - pool5 = flow.reshape(pool5, shape=(pool5.static_shape[0], -1)) + pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1)) fc1 = flow.layers.dense( inputs=pool5, @@ -38,7 +39,6 @@ def alexnet(images, trainable=True): use_bias=False, kernel_initializer=flow.random_uniform_initializer(), bias_initializer=False, - trainable=trainable, name="fc1", ) @@ -51,7 +51,6 @@ def alexnet(images, trainable=True): use_bias=False, kernel_initializer=flow.random_uniform_initializer(), bias_initializer=False, - trainable=trainable, name="fc2", ) @@ -64,7 +63,6 @@ def alexnet(images, trainable=True): use_bias=False, kernel_initializer=flow.random_uniform_initializer(), bias_initializer=False, - trainable=trainable, name="fc3", ) diff --git a/cnn_benchmark/benchmark.md b/cnn_benchmark/benchmark.md deleted file mode 100644 index 654ffc068ca2e04ba2da39846065d99d3175ef97..0000000000000000000000000000000000000000 --- a/cnn_benchmark/benchmark.md +++ /dev/null @@ -1,51 +0,0 @@ -## Inference - -测试平台:Nvidia GTX2080Ti单卡. -CUDA版本:10.0 -CUDNN版本:7.5.0 -TensorRT版本:6.0.1 - -Oneflow-Benchmark -branch: of_dev_python_py3 -commit: 985dd3f03887d266e66573db0b31a4cf3051ff31 - -Oneflow: -branch: of_xrt_tensorrt -commit: 726c3a12b9d97b57f9fb7e3d212b63564e20e755 - -### CV - -#### Speed - -输入图片大小为224 (inception-v3为299),预热5 batches,平均吞吐(img/s)为500个batches的平均值。 - -1. batch size为8 - ->| - | Oneflow(fp32) | Oneflow(fp16) | TensorRT(fp32) | TensorRT(fp16) | TensorRT(int8) | ->| ------------ | ------------- | ------------- | -------------- | -------------- | -------------- | ->| alexnet | 2637 | 1550 | 2540 | 2759 | | ->| vgg16 | 371 | 332 | 377 | 1124 | | ->| resnet50 | 657 | 541 | 729 | 940 | | ->| inception-v3 | 433 | 434 | 489 | 999 | | - -2. batch size为50 - ->| - | Oneflow(fp32) | Oneflow(fp16) | TensorRT(fp32) | TensorRT(fp16) | TensorRT(int8) | ->| ------------ | ------------- | ------------- | -------------- | -------------- | -------------- | ->| alexnet | 6999 | 3219 | 4306 | 7704 | | ->| vgg16 | 497 | 476 | 404 | 1482 | | ->| resnet50 | 810 | 619 | 830 | 1285 | | ->| inception-v3 | 544 | 531 | 717 | 1839 | | - - -#### Precision - -总共5w张图片, 统计Top1 accuracy和相对oneflow fp32的分类误差数量。 - ->| - | Oneflow(fp32) | Oneflow(fp16) | TensorRT(fp32) | TensorRT(fp16) | TensorRT(int8) | ->| ------------ | ------------- | ------------- | -------------- | -------------- | -------------- | ->| vgg16 | 0.495 / 0 | 0.495 / 61 | 0.495 / 0 | 0.495 / 101 | | ->| alexnet | | | | | | ->| resnet50 | 0.613 / 0 | 0.613 / 59 | 0.613 / 0 | 0.613 / 130 | | ->| inception-v3 | | | | | | - diff --git a/cnn_benchmark/benchmark_util.py b/cnn_benchmark/benchmark_util.py deleted file mode 100755 index f71e7c2169aa1693fc14b545263d353da96c6208..0000000000000000000000000000000000000000 --- a/cnn_benchmark/benchmark_util.py +++ /dev/null @@ -1,90 +0,0 @@ -import time -import numpy as np - - -class StopWatch: - def __init__(self): - pass - - def start(self): - self.start_time = time.time() - self.last_split = self.start_time - - def set_start(self, val): - self.start_time = val - self.last_split = self.start_time - - def split(self): - now = time.time() - duration = now - self.last_split - self.last_split = now - return duration - - def stop(self): - self.stop_time = time.time() - - def duration(self): - return self.stop_time - self.start_time - - -class CNNSpeedometer: - def __init__(self): - self.watch = StopWatch() - self.throughoutput_list = [] - - def speedometer_cb( - self, - step, - start_time, - total_batch_size, - skip_iter_num, - iter_num, - loss_print_every_n_iter, - ): - def callback(train_loss): - assert skip_iter_num >= 0 - if skip_iter_num == 0 and step == 0: - self.watch.set_start(start_time) - print("Start trainning without any skipping iteration.") - - if step < skip_iter_num: - if step == 0: - print( - "Skipping {} iterations for benchmark purpose.".format( - skip_iter_num - ) - ) - if (step + 1) == skip_iter_num: - self.watch.start() - print("Start trainning.") - else: - train_step = step - skip_iter_num - - if (train_step + 1) % loss_print_every_n_iter == 0: - loss = train_loss.mean() - - avg_elapse_time_per_iter = ( - self.watch.split() / loss_print_every_n_iter - ) - samples_per_sec = total_batch_size / avg_elapse_time_per_iter - print( - "iter {}, loss: {:.3f}, speed: {:.3f}(sec/batch), {:.3f}(images/sec)".format( - train_step, loss, avg_elapse_time_per_iter, samples_per_sec - ) - ) - self.throughoutput_list.append(samples_per_sec) - - if (train_step + 1) == iter_num: - self.watch.stop() - totoal_duration = self.watch.duration() - avg_samples_per_sec = total_batch_size * iter_num / totoal_duration - - print("-".ljust(66, "-")) - print( - "average speed: {:.3f}(images/sec), new_cal_method: {:.3f}(images/sec)".format( - avg_samples_per_sec, np.mean(self.throughoutput_list) - ) - ) - print("-".ljust(66, "-")) - - return callback diff --git a/cnn_benchmark/config.py b/cnn_benchmark/config.py old mode 100644 new mode 100755 index 802586b3f00d99eb9e12189b86cb14edb304a8a4..a30df4c9cddeefd2aba2004d9cddb5d9a7a77ef2 --- a/cnn_benchmark/config.py +++ b/cnn_benchmark/config.py @@ -1,94 +1,122 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + import argparse from datetime import datetime -import logging -import oneflow -from dali import add_dali_args +from optimizer_util import add_optimizer_args +from ofrecord_util import add_ofrecord_args + def get_parser(parser=None): def str_list(x): return x.split(',') + def int_list(x): return list(map(int, x.split(','))) + def float_list(x): return list(map(float, x.split(','))) + def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Unsupported value encountered.') + if parser is None: parser = argparse.ArgumentParser("flags for cnn benchmark") - parser.add_argument("--dtype", type=str, default='float32', help="float16 float32") + parser.add_argument("--dtype", type=str, + default='float32', help="float16 float32") # resouce parser.add_argument("--gpu_num_per_node", type=int, default=1) parser.add_argument('--num_nodes', type=int, default=1, help='node/machine number for training') - parser.add_argument('--node_ips', type=str_list, default=['192.168.1.15', '192.168.1.16'], + parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'], help='nodes ip list for training, devided by ",", length >= num_nodes') - parser.add_argument("--model", type=str, default="vgg16", help="vgg16 or resnet50") + parser.add_argument("--model", type=str, default="vgg16", + help="vgg16 or resnet50") + parser.add_argument( + '--use_fp16', + type=str2bool, + nargs='?', + const=True, + help='Whether to use use fp16' + ) + parser.add_argument( + '--use_boxing_v2', + type=str2bool, + nargs='?', + const=True, + help='Whether to use boxing v2' + ) - # train - parser.add_argument("--model_load_dir", type=str, default=None, help="model load directory if need") + parser.add_argument( + '--use_new_dataloader', + type=str2bool, + nargs='?', + const=True, + help='Whether to use new dataloader' + ) + # train and validaion + parser.add_argument('--num_epochs', type=int, + default=90, help='number of epochs') + parser.add_argument("--model_load_dir", type=str, + default=None, help="model load directory if need") parser.add_argument("--batch_size_per_device", type=int, default=64) - parser.add_argument("--learning_rate", type=float, default=0.256) - parser.add_argument("--optimizer", type=str, default="momentum-cosine-decay", - help="sgd, adam, momentum, momentum-cosine-decay") - parser.add_argument("--weight_l2", type=float, default=1.0/32768, help="weight decay parameter") - - # from mxnet - parser.add_argument('--num_epochs', type=int, default=90, help='number of epochs') - parser.add_argument('--lr', type=float, default=0.1, help='initial learning rate') - parser.add_argument('--lr-schedule', choices=('multistep', 'cosine'), default='cosine', - help='learning rate schedule') - parser.add_argument('--lr-factor', type=float, default=0.256, - help='the ratio to reduce lr on each step') - parser.add_argument('--lr-steps', type=float_list, default=[], - help='the epochs to reduce the lr, e.g. 30,60') - parser.add_argument('--warmup-epochs', type=int, default=5, - help='the epochs to ramp-up lr to scaled large-batch value') - - parser.add_argument("--input_layout", type=str, default='NHWC', help="NCHW or NHWC") - parser.add_argument('--image-shape', type=int_list, default=[3, 224, 224], - help='the image shape feed into the network') + parser.add_argument("--val_batch_size_per_device", type=int, default=8) + + # for data process + parser.add_argument("--num_examples", type=int, + default=1281167, help="train pic number") + parser.add_argument("--num_val_examples", type=int, + default=50000, help="validation pic number") parser.add_argument('--rgb-mean', type=float_list, default=[123.68, 116.779, 103.939], help='a tuple of size 3 for the mean rgb') parser.add_argument('--rgb-std', type=float_list, default=[58.393, 57.12, 57.375], help='a tuple of size 3 for the std rgb') - parser.add_argument('--data_train', type=str, help='the training data') - parser.add_argument('--data_train_idx', type=str, default='', help='the index of training data') - parser.add_argument('--data_val', type=str, help='the validation data') - parser.add_argument('--data_val_idx', type=str, default='', help='the index of validation data') - parser.add_argument("--num_examples", type=int, default=1281167, help="train pic number") - parser.add_argument("--num_val_examples", type=int, default=50000, help="validation pic number") - - ## snapshot + parser.add_argument("--input_layout", type=str, + default='NHWC', help="NCHW or NHWC") + parser.add_argument('--image-shape', type=int_list, default=[3, 224, 224], + help='the image shape feed into the network') + + # snapshot parser.add_argument("--model_save_dir", type=str, - default="./output/model_save-{}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))), - help="model save directory", - ) - parser.add_argument("--val_batch_size_per_device", type=int, default=8) + default="./output/snapshots/model_save-{}".format( + str(datetime.now().strftime("%Y%m%d%H%M%S"))), + help="model save directory", + ) # log and loss print - parser.add_argument("--log_dir", type=str, default="./output", help="log info save directory") + parser.add_argument("--log_dir", type=str, + default="./output", help="log info save directory") parser.add_argument( "--loss_print_every_n_iter", type=int, default=1, help="print loss every n iteration", ) - add_dali_args(parser) + add_ofrecord_args(parser) + add_optimizer_args(parser) return parser def print_args(args): print("=".ljust(66, "=")) print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format( - args.model, args.gpu_num_per_node, args.num_nodes)) + args.model, args.gpu_num_per_node, args.num_nodes)) print("=".ljust(66, "=")) for arg in vars(args): print("{} = {}".format(arg, getattr(args, arg))) print("-".ljust(66, "-")) - print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) + print("Time stamp: {}".format( + str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) if __name__ == '__main__': diff --git a/cnn_benchmark/dali.py b/cnn_benchmark/dali.py deleted file mode 100644 index 5c87687b6d26688bfe21475ceda5087c80dbeb2b..0000000000000000000000000000000000000000 --- a/cnn_benchmark/dali.py +++ /dev/null @@ -1,418 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import time -import ctypes -#import logging -import warnings -from nvidia import dali -from nvidia.dali.pipeline import Pipeline -import nvidia.dali.ops as ops -import nvidia.dali.types as types -from nvidia.dali.backend import TensorGPU - - -def add_dali_args(parser): - group = parser.add_argument_group('DALI data backend', - 'entire group applies only to dali data backend') - group.add_argument('--dali-separ-val', action='store_true', - help='each process will perform independent validation on whole val-set') - group.add_argument('--dali-threads', type=int, default=3, help="number of threads" +\ - "per GPU for DALI") - group.add_argument('--dali-validation-threads', type=int, default=10, - help="number of threads per GPU for DALI for validation") - group.add_argument('--dali-prefetch-queue', type=int, default=2, - help="DALI prefetch queue depth") - group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, - help="Memory padding value for nvJPEG (in MB)") - group.add_argument('--dali-fuse-decoder', type=int, default=1, - help="0 or 1 whether to fuse decoder or not") - return parser - - -class HybridTrainPipe(Pipeline): - def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, - shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, - output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): - super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, - seed=12 + device_id, - prefetch_queue_depth = prefetch_queue) - self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], - random_shuffle=True, shard_id=shard_id, num_shards=num_shards) - - dali_device = "cpu" if dali_cpu else "mixed" - dali_resize_device = "cpu" if dali_cpu else "gpu" - - if args.dali_fuse_decoder: - self.decode = ops.ImageDecoderRandomCrop(device=dali_device, output_type=types.RGB, - device_memory_padding=nvjpeg_padding, - host_memory_padding=nvjpeg_padding) - self.resize = ops.Resize(device=dali_resize_device, resize_x=crop_shape[1], - resize_y=crop_shape[0]) - else: - self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB, - device_memory_padding=nvjpeg_padding, - host_memory_padding=nvjpeg_padding) - self.resize = ops.RandomResizedCrop(device=dali_resize_device, size=crop_shape) - - - self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, - output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, - output_layout=output_layout, crop=crop_shape, pad_output=pad_output, - image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) - self.coin = ops.CoinFlip(probability=0.5) - - def define_graph(self): - rng = self.coin() - self.jpegs, self.labels = self.input(name="Reader") - - images = self.decode(self.jpegs) - images = self.resize(images) - output = self.cmnp(images, mirror=rng) - return [output, self.labels] - - -class HybridValPipe(Pipeline): - def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, shard_id, - num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, resize_shp=None, - output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): - super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, - prefetch_queue_depth=prefetch_queue) - self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], - random_shuffle=False, shard_id=shard_id, num_shards=num_shards) - - if dali_cpu: - dali_device = "cpu" - self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB) - else: - dali_device = "gpu" - self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB, - device_memory_padding=nvjpeg_padding, - host_memory_padding=nvjpeg_padding) - self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None - self.cmnp = ops.CropMirrorNormalize(device=dali_device, - output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, - output_layout=output_layout, crop=crop_shape, pad_output=pad_output, - image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) - - def define_graph(self): - self.jpegs, self.labels = self.input(name="Reader") - images = self.decode(self.jpegs) - if self.resize: - images = self.resize(images) - output = self.cmnp(images) - return [output, self.labels] - - -def feed_ndarray(dali_tensor, arr): - """ - Copy contents of DALI tensor to numpy's NDArray. - - Parameters - ---------- - `dali_tensor` : nvidia.dali.backend.TensorCPU or nvidia.dali.backend.TensorGPU - Tensor from which to copy - `arr` : numpy.NDArray - Destination of the copy - """ - # Wait until arr is no longer used by the engine - assert dali_tensor.shape() == list(arr.shape), \ - ("Shapes do not match: DALI tensor has shape {0}" - ", but NDArray has shape {1}".format(dali_tensor.shape(), list(arr.shape))) - # Get CTypes void pointer to the underlying memory held by arr - c_type_pointer = ctypes.c_void_p(arr.ctypes.data) - # Copy data from DALI tensor to ptr - dali_tensor.copy_to_external(c_type_pointer) - - -class DALIGenericIterator(object): - """ - General DALI iterator for Numpy. It can return any number of - outputs from the DALI pipeline in the form of ndarray. - - Parameters - ---------- - pipelines : list of nvidia.dali.pipeline.Pipeline - List of pipelines to use - size : int, Number of samples in the epoch (Usually the size of the dataset). - data_layout : str, optional, default = 'NCHW' - Either 'NHWC' or 'NCHW' - layout of the pipeline outputs. - fill_last_batch : bool, optional, default = True - Whether to fill the last batch with data up to 'self.batch_size'. - The iterator would return the first integer multiple - of self._num_gpus * self.batch_size entries which exceeds 'size'. - Setting this flag to False will cause the iterator to return - exactly 'size' entries. - auto_reset : bool, optional, default = False - Whether the iterator resets itself for the next epoch - or it requires reset() to be called separately. - squeeze_labels: bool, optional, default = True - Whether the iterator should squeeze the labels before - copying them to the ndarray. - dynamic_shape: bool, optional, default = False - Whether the shape of the output of the DALI pipeline can - change during execution. If True, the ndarray will be resized accordingly - if the shape of DALI returned tensors changes during execution. - If False, the iterator will fail in case of change. - last_batch_padded : bool, optional, default = False - Whether the last batch provided by DALI is padded with the last sample - or it just wraps up. In the conjunction with `fill_last_batch` it tells - if the iterator returning last batch with data only partially filled with - data from the current epoch is dropping padding samples or samples from - the next epoch. If set to False next epoch will end sooner as data from - it was consumed but dropped. If set to True next epoch would be the - same length as the first one. For this happen, the option `pad_last_batch` - in the reader need to be set to `True` as well. - - Example - ------- - With the data set [1,2,3,4,5,6,7] and the batch size 2: - fill_last_batch = False, last_batch_padded = True -> last batch = [7], next iteration will return [1, 2] - fill_last_batch = False, last_batch_padded = False -> last batch = [7], next iteration will return [2, 3] - fill_last_batch = True, last_batch_padded = True -> last batch = [7, 7], next iteration will return [1, 2] - fill_last_batch = True, last_batch_padded = False -> last batch = [7, 1], next iteration will return [2, 3] - """ - def __init__(self, - pipelines, - size, - output_map=['data', 'label'], - data_layout='NCHW', - fill_last_batch=False, - auto_reset=False, - squeeze_labels=True, - dynamic_shape=False, - last_batch_padded=False): - if not isinstance(pipelines, list): - pipelines = [pipelines] - self._num_gpus = len(pipelines) - assert pipelines is not None, "Number of provided pipelines has to be at least 1" - self.batch_size = pipelines[0].batch_size - self._size = int(size) - self._pipes = pipelines - self._fill_last_batch = fill_last_batch - self._last_batch_padded = last_batch_padded - self._auto_reset = auto_reset - self._squeeze_labels = squeeze_labels - self._dynamic_shape = dynamic_shape - # Build all pipelines - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.build() - # Use double-buffering of data batches - self._data_batches = [[None] for i in range(self._num_gpus)] - self._counter = 0 - self._current_data_batch = 0 - self.output_map = output_map - - # We need data about the batches (like shape information), - # so we need to run a single batch as part of setup to get that info - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.schedule_run() - self._first_batch = None - self._first_batch = self.next() - - - def __next__(self): - if self._first_batch is not None: - batch = self._first_batch - self._first_batch = None - return batch - if self._counter >= self._size: - if self._auto_reset: - self.reset() - raise StopIteration - # Gather outputs - outputs = [] - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - outputs.append(p.share_outputs()) - for i in range(self._num_gpus): - # MXNet wants batches with clear distinction between - # data and label entries, so segregate outputs into - # 2 categories - # Change DALI TensorLists into Tensors - category_tensors = dict() - category_info = dict() - for j, out in enumerate(outputs[i]): - x = out.as_tensor() - category_tensors[self.output_map[j]] = x#.as_tensor() - if self._squeeze_labels and self.output_map[j]=='label': - category_tensors[self.output_map[j]].squeeze() - category_info[self.output_map[j]] = (x.shape(), np.dtype(x.dtype())) - - # If we did not yet allocate memory for that batch, do it now - if self._data_batches[i][self._current_data_batch] is None: - for category in self.output_map: - t = category_tensors[category] - assert type(t) is not TensorGPU, "CPU data only"#TODO - d = [] - for (shape, dtype) in category_info.values(): - d.append(np.zeros(shape, dtype = dtype)) - - self._data_batches[i][self._current_data_batch] = d - - d = self._data_batches[i][self._current_data_batch] - # Copy data from DALI Tensors to NDArrays - if self._dynamic_shape: - for j, (shape, dtype) in enumerate(category_info): - if list(d[j].shape) != shape: - d[j] = np.zeros(shape, dtype = dtype) - - for j, d_arr in enumerate(d): - feed_ndarray(category_tensors[self.output_map[j]], d_arr) - - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.release_outputs() - p.schedule_run() - - copy_db_index = self._current_data_batch - # Change index for double buffering - self._current_data_batch = (self._current_data_batch + 1) % 1 - self._counter += self._num_gpus * self.batch_size - - assert not self._fill_last_batch - ## padding the last batch - #if (not self._fill_last_batch) and (self._counter > self._size): - # # this is the last batch and we need to pad - # overflow = self._counter - self._size - # overflow_per_device = overflow // self._num_gpus - # difference = self._num_gpus - (overflow % self._num_gpus) - # for i in range(self._num_gpus): - # if i < difference: - # self._data_batches[i][copy_db_index].pad = overflow_per_device - # else: - # self._data_batches[i][copy_db_index].pad = overflow_per_device + 1 - #else: - # for db in self._data_batches: - # db[copy_db_index].pad = 0 - - #_data_batches[gpu_id][_current_data_batch][images, labels] - images = [db[copy_db_index][0] for db in self._data_batches] - labels = [db[copy_db_index][1] for db in self._data_batches] - return images, labels - #return [db[copy_db_index] for db in self._data_batches] - - def next(self): - """ - Returns the next batch of data. - """ - return self.__next__() - - def __iter__(self): - return self - - def reset(self): - """ - Resets the iterator after the full epoch. - DALI iterators do not support resetting before the end of the epoch - and will ignore such request. - """ - if self._counter >= self._size: - if self._fill_last_batch and not self._last_batch_padded: - self._counter = self._counter % self._size - else: - self._counter = 0 - for p in self._pipes: - p.reset() - if p.empty(): - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.schedule_run() - else: - print("DALI iterator does not support resetting while epoch is not finished. Ignoring...") - - -def get_rec_iter(args, dali_cpu=False, concat=True): - gpus = range(args.gpu_num_per_node) - rank = 0 #TODO - nWrk = 1 #TODO - - num_threads = args.dali_threads - num_validation_threads = args.dali_validation_threads - pad_output = (args.image_shape[0] == 4) - - # the input_layout w.r.t. the model is the output_layout of the image pipeline - output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW - - trainpipes = [HybridTrainPipe(args = args, - batch_size = args.batch_size_per_device, - num_threads = num_threads, - device_id = gpu_id, - rec_path = args.data_train, - idx_path = args.data_train_idx, - shard_id = gpus.index(gpu_id) + len(gpus)*rank, - num_shards = len(gpus)*nWrk, - crop_shape = args.image_shape[1:], - output_layout = output_layout, - dtype = args.dtype, - pad_output = pad_output, - dali_cpu = dali_cpu, - nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, - prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] - - if args.data_val: - valpipes = [HybridValPipe(args = args, - batch_size = args.val_batch_size_per_device, - num_threads = num_validation_threads, - device_id = gpu_id, - rec_path = args.data_val, - idx_path = args.data_val_idx, - shard_id = gpus.index(gpu_id) + len(gpus)*rank, - num_shards = len(gpus)*nWrk, - crop_shape = args.image_shape[1:], - resize_shp = 256, #args.data_val_resize, - output_layout = output_layout, - dtype = args.dtype, - pad_output = pad_output, - dali_cpu = dali_cpu, - nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, - prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] - trainpipes[0].build() - if args.data_val: - valpipes[0].build() - val_examples = valpipes[0].epoch_size("Reader") - - if args.num_examples < trainpipes[0].epoch_size("Reader"): - warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, trainpipes[0].epoch_size("Reader"))) - - dali_train_iter = DALIGenericIterator(trainpipes, args.num_examples) - if args.data_val: - dali_val_iter = DALIGenericIterator(valpipes, val_examples, fill_last_batch = False) - else: - dali_val_iter = None - - return dali_train_iter, dali_val_iter - - -if __name__ == '__main__': - import config as configs - parser = configs.get_parser() - args = parser.parse_args() - configs.print_args(args) - train_data_iter, val_data_iter = get_rec_iter(args, True) - for epoch in range(args.num_epochs): - tic = time.time() - last_time = time.time() - print('Starting epoch {}'.format(epoch)) - train_data_iter.reset() - for i, batches in enumerate(train_data_iter): - images, labels = batches - if i % args.loss_print_every_n_iter == 0: - print(args.loss_print_every_n_iter * 256 / (time.time() - last_time)) - last_time = time.time() - #print(images.shape) - epoch_time = time.time() - tic - print('epoch mena images/sec', 1281167 / epoch_time, epoch_time) diff --git a/cnn_benchmark/dali_consistent.py b/cnn_benchmark/dali_consistent.py deleted file mode 100644 index 52a350157f225bcd87e5120cf99c80edca099c32..0000000000000000000000000000000000000000 --- a/cnn_benchmark/dali_consistent.py +++ /dev/null @@ -1,423 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import time -import ctypes -#import logging -import warnings -from nvidia import dali -from nvidia.dali.pipeline import Pipeline -import nvidia.dali.ops as ops -import nvidia.dali.types as types -from nvidia.dali.backend import TensorGPU - - -def add_dali_args(parser): - group = parser.add_argument_group('DALI data backend', - 'entire group applies only to dali data backend') - group.add_argument('--dali-separ-val', action='store_true', - help='each process will perform independent validation on whole val-set') - group.add_argument('--dali-threads', type=int, default=3, help="number of threads" +\ - "per GPU for DALI") - group.add_argument('--dali-validation-threads', type=int, default=10, - help="number of threads per GPU for DALI for validation") - group.add_argument('--dali-prefetch-queue', type=int, default=2, - help="DALI prefetch queue depth") - group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, - help="Memory padding value for nvJPEG (in MB)") - group.add_argument('--dali-fuse-decoder', type=int, default=1, - help="0 or 1 whether to fuse decoder or not") - return parser - - -class HybridTrainPipe(Pipeline): - def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, - shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, - output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): - super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, - seed=12 + device_id, - prefetch_queue_depth = prefetch_queue) - self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], - random_shuffle=True, shard_id=shard_id, num_shards=num_shards) - self.cast = ops.Cast(dtype=types.INT32) - - dali_device = "cpu" if dali_cpu else "mixed" - dali_resize_device = "cpu" if dali_cpu else "gpu" - - if args.dali_fuse_decoder: - self.decode = ops.ImageDecoderRandomCrop(device=dali_device, output_type=types.RGB, - device_memory_padding=nvjpeg_padding, - host_memory_padding=nvjpeg_padding) - self.resize = ops.Resize(device=dali_resize_device, resize_x=crop_shape[1], - resize_y=crop_shape[0]) - else: - self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB, - device_memory_padding=nvjpeg_padding, - host_memory_padding=nvjpeg_padding) - self.resize = ops.RandomResizedCrop(device=dali_resize_device, size=crop_shape) - - - self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, - output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, - output_layout=output_layout, crop=crop_shape, pad_output=pad_output, - image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) - self.coin = ops.CoinFlip(probability=0.5) - - def define_graph(self): - rng = self.coin() - self.jpegs, self.labels = self.input(name="Reader") - - images = self.decode(self.jpegs) - images = self.resize(images) - output = self.cmnp(images, mirror=rng) - return [output, self.cast(self.labels)] - - -class HybridValPipe(Pipeline): - def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, shard_id, - num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, resize_shp=None, - output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): - super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, - prefetch_queue_depth=prefetch_queue) - self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], - random_shuffle=False, shard_id=shard_id, num_shards=num_shards) - - self.cast = ops.Cast(dtype=types.INT32) - if dali_cpu: - dali_device = "cpu" - self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB) - else: - dali_device = "gpu" - self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB, - device_memory_padding=nvjpeg_padding, - host_memory_padding=nvjpeg_padding) - self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None - self.cmnp = ops.CropMirrorNormalize(device=dali_device, - output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, - output_layout=output_layout, crop=crop_shape, pad_output=pad_output, - image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) - - def define_graph(self): - self.jpegs, self.labels = self.input(name="Reader") - images = self.decode(self.jpegs) - if self.resize: - images = self.resize(images) - output = self.cmnp(images) - #return [output, self.labels] - return [output, self.cast(self.labels)] - - -def feed_ndarray(dali_tensor, arr, offset): - """ - Copy contents of DALI tensor to numpy's NDArray. - - Parameters - ---------- - `dali_tensor` : nvidia.dali.backend.TensorCPU or nvidia.dali.backend.TensorGPU - Tensor from which to copy - `arr` : numpy.NDArray - Destination of the copy - """ - # Wait until arr is no longer used by the engine - #assert dali_tensor.shape() == list(arr.shape), \ - # ("Shapes do not match: DALI tensor has shape {0}" - # ", but NDArray has shape {1}".format(dali_tensor.shape(), list(arr.shape))) - # Get CTypes void pointer to the underlying memory held by arr - c_type_pointer = ctypes.c_void_p(arr.ctypes.data + offset) - # Copy data from DALI tensor to ptr - dali_tensor.copy_to_external(c_type_pointer) - - -class DALIGenericIterator(object): - """ - General DALI iterator for Numpy. It can return any number of - outputs from the DALI pipeline in the form of ndarray. - - Parameters - ---------- - pipelines : list of nvidia.dali.pipeline.Pipeline - List of pipelines to use - size : int, Number of samples in the epoch (Usually the size of the dataset). - data_layout : str, optional, default = 'NCHW' - Either 'NHWC' or 'NCHW' - layout of the pipeline outputs. - fill_last_batch : bool, optional, default = True - Whether to fill the last batch with data up to 'self.batch_size'. - The iterator would return the first integer multiple - of self._num_gpus * self.batch_size entries which exceeds 'size'. - Setting this flag to False will cause the iterator to return - exactly 'size' entries. - auto_reset : bool, optional, default = False - Whether the iterator resets itself for the next epoch - or it requires reset() to be called separately. - squeeze_labels: bool, optional, default = True - Whether the iterator should squeeze the labels before - copying them to the ndarray. - dynamic_shape: bool, optional, default = False - Whether the shape of the output of the DALI pipeline can - change during execution. If True, the ndarray will be resized accordingly - if the shape of DALI returned tensors changes during execution. - If False, the iterator will fail in case of change. - last_batch_padded : bool, optional, default = False - Whether the last batch provided by DALI is padded with the last sample - or it just wraps up. In the conjunction with `fill_last_batch` it tells - if the iterator returning last batch with data only partially filled with - data from the current epoch is dropping padding samples or samples from - the next epoch. If set to False next epoch will end sooner as data from - it was consumed but dropped. If set to True next epoch would be the - same length as the first one. For this happen, the option `pad_last_batch` - in the reader need to be set to `True` as well. - - Example - ------- - With the data set [1,2,3,4,5,6,7] and the batch size 2: - fill_last_batch = False, last_batch_padded = True -> last batch = [7], next iteration will return [1, 2] - fill_last_batch = False, last_batch_padded = False -> last batch = [7], next iteration will return [2, 3] - fill_last_batch = True, last_batch_padded = True -> last batch = [7, 7], next iteration will return [1, 2] - fill_last_batch = True, last_batch_padded = False -> last batch = [7, 1], next iteration will return [2, 3] - """ - def __init__(self, - pipelines, - size, - output_map=['data', 'label'], - data_layout='NCHW', - fill_last_batch=False, - auto_reset=False, - squeeze_labels=True, - dynamic_shape=False, - last_batch_padded=False): - if not isinstance(pipelines, list): - pipelines = [pipelines] - self._num_gpus = len(pipelines) - assert pipelines is not None, "Number of provided pipelines has to be at least 1" - self.batch_size = pipelines[0].batch_size - self._size = int(size) - self._pipes = pipelines - self._fill_last_batch = fill_last_batch - self._last_batch_padded = last_batch_padded - self._auto_reset = auto_reset - self._squeeze_labels = squeeze_labels - assert dynamic_shape == False, "support fixed shape only." - self._dynamic_shape = dynamic_shape - # Build all pipelines - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.build() - # Use double-buffering of data batches - #self._data_batches = [[None] for i in range(self._num_gpus)] - self._data_batches = [None for i in range(2)] - self._counter = 0 - self._current_data_batch = 0 - self.output_map = output_map - - # We need data about the batches (like shape information), - # so we need to run a single batch as part of setup to get that info - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.schedule_run() - self._first_batch = None - self._first_batch = self.next() - - - def __next__(self): - if self._first_batch is not None: - batch = self._first_batch - self._first_batch = None - return batch - if self._counter >= self._size: - if self._auto_reset: - self.reset() - raise StopIteration - # Gather outputs - outputs = [] - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - outputs.append(p.share_outputs()) - for gpu_id in range(self._num_gpus): - # MXNet wants batches with clear distinction between - # data and label entries, so segregate outputs into - # 2 categories - # Change DALI TensorLists into Tensors - category_tensors = dict() - category_info = [] - for j, out in enumerate(outputs[gpu_id]): - x = out.as_tensor() - category_tensors[self.output_map[j]] = x#.as_tensor() - if self._squeeze_labels and self.output_map[j]=='label': - category_tensors[self.output_map[j]].squeeze() - category_info.append((x.shape(), np.dtype(x.dtype()))) - - # If we did not yet allocate memory for that batch, do it now - if self._data_batches[self._current_data_batch] is None: - for category in self.output_map: - t = category_tensors[category] - assert type(t) is not TensorGPU, "CPU data only"#TODO - d = [] - self.category_nbytes = [] - for j, (shape, dtype) in enumerate(category_info): - self.category_nbytes.append(np.zeros(shape, dtype = dtype).nbytes) - shape[0] = self._num_gpus * shape[0] - d.append(np.zeros(shape, dtype = dtype)) - - self._data_batches[self._current_data_batch] = d - - d = self._data_batches[self._current_data_batch] - # Copy data from DALI Tensors to NDArrays - if self._dynamic_shape: - for j, (shape, dtype) in enumerate(category_info): - if list(d[j].shape) != shape: - d[j] = np.zeros(shape, dtype = dtype) - - for j, d_arr in enumerate(d): - offset = gpu_id * self.category_nbytes[j] - feed_ndarray(category_tensors[self.output_map[j]], d_arr, offset) - - for p in self._pipes: - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.release_outputs() - p.schedule_run() - - copy_db_index = self._current_data_batch - # Change index for double buffering - self._current_data_batch = (self._current_data_batch + 1) % 1 - self._counter += self._num_gpus * self.batch_size - - assert not self._fill_last_batch - ## padding the last batch - #if (not self._fill_last_batch) and (self._counter > self._size): - # # this is the last batch and we need to pad - # overflow = self._counter - self._size - # overflow_per_device = overflow // self._num_gpus - # difference = self._num_gpus - (overflow % self._num_gpus) - # for i in range(self._num_gpus): - # if i < difference: - # self._data_batches[i][copy_db_index].pad = overflow_per_device - # else: - # self._data_batches[i][copy_db_index].pad = overflow_per_device + 1 - #else: - # for db in self._data_batches: - # db[copy_db_index].pad = 0 - - return self._data_batches[copy_db_index] - - def next(self): - """ - Returns the next batch of data. - """ - return self.__next__() - - def __iter__(self): - return self - - def reset(self): - """ - Resets the iterator after the full epoch. - DALI iterators do not support resetting before the end of the epoch - and will ignore such request. - """ - if self._counter >= self._size: - if self._fill_last_batch and not self._last_batch_padded: - self._counter = self._counter % self._size - else: - self._counter = 0 - for p in self._pipes: - p.reset() - if p.empty(): - with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): - p.schedule_run() - else: - print("DALI iterator does not support resetting while epoch is not finished. Ignoring...") - - -def get_rec_iter(args, dali_cpu=False, concat=True): - gpus = range(args.gpu_num_per_node) - rank = 0 #TODO - nWrk = 1 #TODO - - num_threads = args.dali_threads - num_validation_threads = args.dali_validation_threads - pad_output = (args.image_shape[0] == 4) - - # the input_layout w.r.t. the model is the output_layout of the image pipeline - output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW - - trainpipes = [HybridTrainPipe(args = args, - batch_size = args.batch_size_per_device, - num_threads = num_threads, - device_id = gpu_id, - rec_path = args.data_train, - idx_path = args.data_train_idx, - shard_id = gpus.index(gpu_id) + len(gpus)*rank, - num_shards = len(gpus)*nWrk, - crop_shape = args.image_shape[1:], - output_layout = output_layout, - dtype = args.dtype, - pad_output = pad_output, - dali_cpu = dali_cpu, - nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, - prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] - - if args.data_val: - valpipes = [HybridValPipe(args = args, - batch_size = args.val_batch_size_per_device, - num_threads = num_validation_threads, - device_id = gpu_id, - rec_path = args.data_val, - idx_path = args.data_val_idx, - shard_id = gpus.index(gpu_id) + len(gpus)*rank, - num_shards = len(gpus)*nWrk, - crop_shape = args.image_shape[1:], - resize_shp = 256, #args.data_val_resize, - output_layout = output_layout, - dtype = args.dtype, - pad_output = pad_output, - dali_cpu = dali_cpu, - nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, - prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] - trainpipes[0].build() - if args.data_val: - valpipes[0].build() - val_examples = valpipes[0].epoch_size("Reader") - - if args.num_examples < trainpipes[0].epoch_size("Reader"): - warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, trainpipes[0].epoch_size("Reader"))) - - dali_train_iter = DALIGenericIterator(trainpipes, args.num_examples) - if args.data_val: - dali_val_iter = DALIGenericIterator(valpipes, val_examples, fill_last_batch = False) - else: - dali_val_iter = None - - return dali_train_iter, dali_val_iter - - -if __name__ == '__main__': - import config as configs - parser = configs.get_parser() - args = parser.parse_args() - configs.print_args(args) - train_data_iter, val_data_iter = get_rec_iter(args, True) - for epoch in range(args.num_epochs): - tic = time.time() - last_time = time.time() - print('Starting epoch {}'.format(epoch)) - train_data_iter.reset() - for i, batches in enumerate(train_data_iter): - images, labels = batches - if i % args.loss_print_every_n_iter == 0: - print(args.loss_print_every_n_iter * 256 / (time.time() - last_time)) - last_time = time.time() - #print(images.shape) - epoch_time = time.time() - tic - print('epoch mena images/sec', 1281167 / epoch_time, epoch_time) diff --git a/cnn_benchmark/data_loader.py b/cnn_benchmark/data_loader.py deleted file mode 100644 index 69381ada3e9f1e6a69b1f5243b32e5c129c18369..0000000000000000000000000000000000000000 --- a/cnn_benchmark/data_loader.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import oneflow as flow - - -def load_imagenet(data_dir, image_size, batch_size, data_part_num): - image_blob_conf = flow.data.BlobConf( - "encoded", - shape=(image_size, image_size, 3), - dtype=flow.float, - codec=flow.data.ImageCodec( - [ - flow.data.ImagePreprocessor('mirror'), - #flow.data.ImageCropPreprocessor(width=228, height=228), - flow.data.ImageResizePreprocessor(image_size, image_size), - ] - ), - preprocessors=[ - flow.data.NormByChannelPreprocessor( - (123.68, 116.78, 103.94), (255.0, 255.0, 255.0) - ) - ], - ) - - label_blob_conf = flow.data.BlobConf( - "class/label", shape=(), dtype=flow.int32, codec=flow.data.RawCodec() - ) - - return flow.data.decode_ofrecord( - data_dir, - (label_blob_conf, image_blob_conf), - batch_size=batch_size, - data_part_num=data_part_num, - part_name_suffix_length=5, - #shuffle = True, - #buffer_size=16384, - name="decode", - ) - - -def load_synthetic(image_size, batch_size): - label = flow.data.decode_random( - shape=(), - dtype=flow.int32, - batch_size=batch_size, - initializer=flow.zeros_initializer(flow.int32), - ) - - image = flow.data.decode_random( - shape=(image_size, image_size, 3), dtype=flow.float, batch_size=batch_size - ) - - return label, image diff --git a/cnn_e2e/inception_model.py b/cnn_benchmark/inception_model.py old mode 100644 new mode 100755 similarity index 100% rename from cnn_e2e/inception_model.py rename to cnn_benchmark/inception_model.py diff --git a/cnn_benchmark/inceptionv3_model.py b/cnn_benchmark/inceptionv3.py old mode 100644 new mode 100755 similarity index 74% rename from cnn_benchmark/inceptionv3_model.py rename to cnn_benchmark/inceptionv3.py index 7bf4690342cc885c3620b05e8ebea64e417e054e..db324ec7f411fbb00153f617ba2ad3ea8e8ccf69 --- a/cnn_benchmark/inceptionv3_model.py +++ b/cnn_benchmark/inceptionv3.py @@ -1,12 +1,64 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - import oneflow as flow import oneflow.core.operator.op_conf_pb2 as op_conf_util - - +from datetime import datetime +import argparse +import os +import numpy + +_DATA_DIR = "/dataset/PNGS/PNG299/of_record_repeated" +_EVAL_DIR = _DATA_DIR +_TRAIN_DIR = _DATA_DIR +_MODEL_LOAD = "/dataset/PNGS/cnns_model_for_test/inceptionv3/models/of_model" +_MODEL_SAVE_DIR = "./model_save-{}".format( + str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")) +) +NODE_LIST = "192.168.1.12,192.168.1.14" + +class DLNetSpec(object): + def __init__(self, enable_auto_mixed_precision): + self.batch_size = 8 + self.data_part_num = 32 + self.eval_dir = _DATA_DIR + self.train_dir = _DATA_DIR + self.model_save_dir = _MODEL_SAVE_DIR + self.model_load_dir = _MODEL_LOAD + self.num_nodes = 1 + self.gpu_num_per_node = 1 + self.iter_num = 10 + self.enable_auto_mixed_precision = enable_auto_mixed_precision + +parser = argparse.ArgumentParser(description="flags for multi-node and resource") +parser.add_argument("-g", "--gpu_num_per_node", type=int, default=1, required=False) +parser.add_argument("-i", "--iter_num", type=int, default=10, required=False) +parser.add_argument("-b", "--batch_size", type=int, default=8, required=False) +parser.add_argument( + "-m", "--multinode", default=False, action="store_true", required=False +) +parser.add_argument("-n", "--node_list", type=str, default=NODE_LIST, required=False) +parser.add_argument( + "-s", "--skip_scp_binary", default=False, action="store_true", required=False +) +parser.add_argument( + "-c", + "--scp_binary_without_uuid", + default=False, + action="store_true", + required=False, +) +parser.add_argument( + "-r", "--remote_by_hand", default=False, action="store_true", required=False +) +parser.add_argument("-e", "--eval_dir", type=str, default=_DATA_DIR, required=False) +parser.add_argument("-t", "--train_dir", type=str, default=_DATA_DIR, required=False) +parser.add_argument( + "-load", "--model_load_dir", type=str, default=_MODEL_LOAD, required=False +) +parser.add_argument( + "-save", "--model_save_dir", type=str, default=_MODEL_SAVE_DIR, required=False +) +parser.add_argument("-dn", "--data_part_num", type=int, default=32, required=False) + +# TODO: add this interface to oneflow.layers def _conv2d_layer( name, input, @@ -18,7 +70,6 @@ def _conv2d_layer( dilation_rate=1, activation=op_conf_util.kSigmoid, use_bias=True, - trainable=True, weight_initializer=flow.random_uniform_initializer(), bias_initializer=flow.constant_initializer(), ): @@ -26,7 +77,7 @@ def _conv2d_layer( kernel_size = (kernel_size, kernel_size) else: kernel_size = tuple(kernel_size) - weight_shape = (filters, input.static_shape[1]) + kernel_size + weight_shape = (filters, input.shape[1]) + kernel_size weight = flow.get_variable( name + "-weight", shape=weight_shape, @@ -56,6 +107,25 @@ def _conv2d_layer( return output +def _data_load_layer(args, data_dir): + image_blob_conf = flow.data.BlobConf( + "encoded", + shape=(299, 299, 3), + dtype=flow.float, + codec=flow.data.ImageCodec([flow.data.ImagePreprocessor("bgr2rgb")]), + preprocessors=[flow.data.NormByChannelPreprocessor((123.68, 116.78, 103.94))], + ) + label_blob_conf = flow.data.BlobConf( + "class/label", shape=(), dtype=flow.int32, codec=flow.data.RawCodec() + ) + node_num = args.num_nodes + total_batch_size = args.batch_size * args.gpu_num_per_node * node_num + return flow.data.decode_ofrecord( + data_dir, (image_blob_conf, label_blob_conf), + batch_size=total_batch_size, data_part_num=args.data_part_num, name="decode", + ) + + def InceptionA(in_blob, index): with flow.deprecated.variable_scope("mixed_{}".format(index)): with flow.deprecated.variable_scope("branch1x1"): @@ -425,7 +495,7 @@ def InceptionE(in_blob, index): return concat_total -def inceptionv3(images, labels, trainable=True): +def InceptionV3(images, labels, trainable=True): images = flow.transpose(images, perm=[0, 3, 1, 2]) conv0 = _conv2d_layer( @@ -506,5 +576,75 @@ def inceptionv3(images, labels, trainable=True): fc1 = flow.matmul(pool3, weight) fc1 = flow.nn.bias_add(fc1, bias) - return fc1 + loss = flow.nn.sparse_softmax_cross_entropy_with_logits( + labels=labels, logits=fc1, name="softmax_loss" + ) + return loss + + +def main(args): + flow.config.machine_num(args.num_nodes) + flow.config.gpu_device_num(args.gpu_num_per_node) + func_config = flow.FunctionConfig() + func_config.default_distribute_strategy(flow.distribute.consistent_strategy()) + func_config.default_data_type(flow.float) + func_config.train.primary_lr(0.0001) + func_config.train.model_update_conf(dict(naive_conf={})) + func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision) + @flow.function(func_config) + def TrainNet(): + (images, labels) = _data_load_layer(args, args.train_dir) + loss = InceptionV3(images, labels) + flow.losses.add_loss(loss) + return loss + check_point = flow.train.CheckPoint() + if not args.model_load_dir: + check_point.init() + else: + check_point.load(args.model_load_dir) + + num_nodes = args.num_nodes + print("Traning inceptionv3: num_gpu_per_node = {}, num_nodes = {}.".format(args.gpu_num_per_node, num_nodes)) + + print("{:>12} {:>12} {:>12}".format("iter", "loss type", "loss value")) + loss = [] + for i in range(args.iter_num): + train_loss = TrainNet().get().mean() + loss.append(train_loss) + + fmt_str = "{:>12} {:>12} {:>12.6f}" + print(fmt_str.format(i, "train loss:", train_loss)) + + if (i + 1) % 100 == 0: + check_point.save(_MODEL_SAVE_DIR + str(i)) + + # save loss to file + loss_file = "{}n{}c.npy".format(str(num_nodes), str(args.gpu_num_per_node * num_nodes)) + loss_path = "./of_loss/inceptionv3" + if not os.path.exists(loss_path): os.makedirs(loss_path) + numpy.save(os.path.join(loss_path, loss_file), loss) + +if __name__ == "__main__": + args = parser.parse_args() + if args.multinode: + flow.env.ctrl_port(12138) + nodes = [] + for n in args.node_list.strip().split(","): + addr_dict = {} + addr_dict["addr"] = n + nodes.append(addr_dict) + + flow.env.machine(nodes) + if args.remote_by_hand is False: + if args.scp_binary_without_uuid: + flow.deprecated.init_worker(scp_binary=True, use_uuid=False) + elif args.skip_scp_binary: + flow.deprecated.init_worker(scp_binary=False, use_uuid=False) + else: + flow.deprecated.init_worker(scp_binary=True, use_uuid=True) + + main(args) + if (args.multinode and args.skip_scp_binary is False + and args.scp_binary_without_uuid is False): + flow.deprecated.delete_worker() diff --git a/cnn_e2e/job_function_util.py b/cnn_benchmark/job_function_util.py old mode 100644 new mode 100755 similarity index 100% rename from cnn_e2e/job_function_util.py rename to cnn_benchmark/job_function_util.py diff --git a/cnn_benchmark/loss_top1.ipynb b/cnn_benchmark/loss_top1.ipynb deleted file mode 100644 index 481714d3b8db55afcca34347eed658a8e8660dd4..0000000000000000000000000000000000000000 --- a/cnn_benchmark/loss_top1.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import altair as alt\n", - "import pandas as pd\n", - "import numpy as np\n", - "import os\n", - "import glob" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "summary = pd.read_csv('summary.csv').drop(['rank', 'note', 'time'], axis=1)\n", - "summary = summary.groupby(['iter', 'legend'], as_index=False).mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['cfg', 'loss', 'top1_accuracy'], dtype=object)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "summary['legend'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iterlegendvalue
101100top1_accuracy0.00122
202200top1_accuracy0.00096
303300top1_accuracy0.00278
\n", - "
" - ], - "text/plain": [ - " iter legend value\n", - "101 100 top1_accuracy 0.00122\n", - "202 200 top1_accuracy 0.00096\n", - "303 300 top1_accuracy 0.00278" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "summary[summary['legend']=='top1_accuracy']" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from plot_value import plot_many_by_legend" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_many_by_legend({'loss': summary})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/cnn_benchmark/model_util.py b/cnn_benchmark/model_util.py old mode 100644 new mode 100755 index 8cc4c917d0efa351ca7e605783743f936abbcd8c..28e05ee9a4fffc03be08604b8571ffd7c903d2d0 --- a/cnn_benchmark/model_util.py +++ b/cnn_benchmark/model_util.py @@ -17,7 +17,14 @@ def conv2d_layer( weight_initializer=flow.random_uniform_initializer(), bias_initializer=flow.constant_initializer(), ): - weight_shape = (filters, input.shape[1], kernel_size, kernel_size) + if isinstance(kernel_size, int): + kernel_size_1 = kernel_size + kernel_size_2 = kernel_size + if isinstance(kernel_size, list): + kernel_size_1 = kernel_size[0] + kernel_size_2 = kernel_size[1] + + weight_shape = (filters, input.shape[1], kernel_size_1, kernel_size_2) weight = flow.get_variable( name + "-weight", shape=weight_shape, @@ -43,3 +50,43 @@ def conv2d_layer( raise NotImplementedError return output + + +def conv2d_layer_with_bn( + name, + input, + filters, + kernel_size=3, + strides=1, + padding="SAME", + data_format="NCHW", + dilation_rate=1, + activation="Relu", + use_bias=True, + weight_initializer=flow.random_uniform_initializer(), + bias_initializer=flow.constant_initializer(), + use_bn=True, +): + output = conv2d_layer(name=name, + input=input, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + + if use_bn: + output = flow.layers.batch_normalization(inputs=output, + axis=1, + momentum=0.997, + epsilon=1.001e-5, + center=True, + scale=True, + trainable=True, + name=name + "_bn") + return output diff --git a/cnn_benchmark/of_cnn_benchmarks.py b/cnn_benchmark/of_cnn_benchmarks.py deleted file mode 100755 index a4ba38ad32b9904b72d8a4aa9049ee05fa3f6646..0000000000000000000000000000000000000000 --- a/cnn_benchmark/of_cnn_benchmarks.py +++ /dev/null @@ -1,249 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time -import argparse -from datetime import datetime - -import oneflow as flow - -import data_loader -import vgg_model -import resnet_model -import alexnet_model -import benchmark_util - - -parser = argparse.ArgumentParser(description="flags for cnn benchmark") - -# resouce -parser.add_argument("--gpu_num_per_node", type=int, default=1, required=False) -parser.add_argument("--node_num", type=int, default=1) -parser.add_argument( - "--node_list", - type=str, - default=None, - required=False, - help="nodes' IP address, split by comma", -) - -# train -parser.add_argument( - "--model", type=str, default="vgg16", required=False, help="vgg16 or resnet50" -) -parser.add_argument("--batch_size_per_device", type=int, default=8, required=False) -parser.add_argument("--learning_rate", type=float, default=1e-4, required=False) -parser.add_argument( - "--optimizer", type=str, default="sgd", required=False, help="sgd, adam, momentum" -) -parser.add_argument( - "--weight_l2", - type=float, - default=None, - required=False, - help="weight decay parameter", -) -parser.add_argument( - "--iter_num", type=int, default=10, required=False, help="total iterations to run" -) -parser.add_argument( - "--skip_iter_num", - type=int, - default=0, - required=False, - help="number of skipping iterations for benchmark purpose.", -) -parser.add_argument( - "--data_dir", type=str, default=None, required=False, help="dataset directory" -) -parser.add_argument( - "--data_part_num", - type=int, - default=32, - required=False, - help="data part number in dataset", -) -parser.add_argument( - "--image_size", type=int, default=228, required=False, help="image size" -) - -# log and resore/save -parser.add_argument( - "--loss_print_every_n_iter", - type=int, - default=1, - required=False, - help="print loss every n iteration", -) -parser.add_argument( - "--model_save_every_n_iter", - type=int, - default=200, - required=False, - help="save model every n iteration", -) -parser.add_argument( - "--model_save_dir", - type=str, - default="./output/model_save-{}".format( - str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")) - ), - required=False, - help="model save directory", -) -parser.add_argument( - "--save_last_snapshot", - type=bool, - default=False, - required=False, - help="save model snapshot for last iteration", -) -parser.add_argument( - "--model_load_dir", - type=str, - default=None, - required=False, - help="model load directory", -) -parser.add_argument( - "--log_dir", - type=str, - default="./output", - required=False, - help="log info save directory", -) - -args = parser.parse_args() - - -model_dict = { - "resnet50": resnet_model.resnet50, - "vgg16": vgg_model.vgg16, - "alexnet": alexnet_model.alexnet, -} - -optimizer_dict = { - "sgd": {"naive_conf": {}}, - "adam": {"adam_conf": {"beta1": 0.9}}, - "momentum": {"momentum_conf": {"beta": 0.9}}, - "momentum-decay": { - "momentum_conf": {"beta": 0.9}, - "learning_rate_decay": { - "polynomial_conf": {"decay_batches": 300000, "end_learning_rate": 0.0001,}, - }, - }, -} - -# "warmup_conf": {"linear_conf": {"warmup_batches":10000, "start_multiplier":0}}, - -func_config = flow.FunctionConfig() -func_config.default_distribute_strategy(flow.distribute.consistent_strategy()) -func_config.train.primary_lr(args.learning_rate) -func_config.default_data_type(flow.float) -func_config.train.model_update_conf(optimizer_dict[args.optimizer]) -func_config.disable_all_reduce_sequence(True) -func_config.all_reduce_group_min_mbyte(8) -func_config.all_reduce_group_num(128) - -if args.weight_l2: - func_config.train.weight_l2(args.weight_l2) - -flow.config.gpu_device_num(args.gpu_num_per_node) - - -@flow.function(func_config) -def TrainNet(): - - total_device_num = args.node_num * args.gpu_num_per_node - batch_size = total_device_num * args.batch_size_per_device - - if args.data_dir: - assert os.path.exists(args.data_dir) - print("Loading data from {}".format(args.data_dir)) - (labels, images) = data_loader.load_imagenet( - args.data_dir, args.image_size, batch_size, args.data_part_num - ) - else: - print("Loading synthetic data.") - (labels, images) = data_loader.load_synthetic(args.image_size, batch_size) - - logits = model_dict[args.model](images) - loss = flow.nn.sparse_softmax_cross_entropy_with_logits( - labels, logits, name="softmax_loss" - ) - flow.losses.add_loss(loss) - - return loss - - -def main(): - print("=".ljust(66, "=")) - print( - "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format( - args.model, args.gpu_num_per_node, args.node_num - ) - ) - print("=".ljust(66, "=")) - for arg in vars(args): - print("{} = {}".format(arg, getattr(args, arg))) - print("-".ljust(66, "-")) - print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) - flow.env.grpc_use_no_signal() - flow.env.log_dir(args.log_dir) - - if args.node_num > 1: - nodes = [] - for n in args.node_list.strip().split(","): - addr_dict = {} - addr_dict["addr"] = n - nodes.append(addr_dict) - - flow.env.machine(nodes) - - check_point = flow.train.CheckPoint() - if args.model_load_dir: - assert os.path.isdir(args.model_load_dir) - print("Restoring model from {}.".format(args.model_load_dir)) - check_point.load(args.model_load_dir) - else: - print("Init model on demand.") - check_point.init() - - total_batch_size = ( - args.node_num * args.gpu_num_per_node * args.batch_size_per_device - ) - speedometer = benchmark_util.CNNSpeedometer() - start_time = time.time() - - for step in range(args.skip_iter_num + args.iter_num): - cb = speedometer.speedometer_cb( - step, - start_time, - total_batch_size, - args.skip_iter_num, - args.iter_num, - args.loss_print_every_n_iter, - ) - TrainNet().async_get(cb) - - if (step + 1) % args.model_save_every_n_iter == 0: - if not os.path.exists(args.model_save_dir): - os.makedirs(args.model_save_dir) - snapshot_save_path = os.path.join( - args.model_save_dir, "snapshot_%d" % (step + 1) - ) - print("Saving model to {}.".format(snapshot_save_path)) - check_point.save(snapshot_save_path) - - if args.save_last_snapshot: - snapshot_save_path = os.path.join(args.model_save_dir, "last_snapshot") - if not os.path.exists(snapshot_save_path): - os.makedirs(snapshot_save_path) - print("Saving model to {}.".format(snapshot_save_path)) - check_point.save(snapshot_save_path) - - -if __name__ == "__main__": - main() diff --git a/cnn_benchmark/of_cnn_infer_benchmarks.py b/cnn_benchmark/of_cnn_infer_benchmarks.py deleted file mode 100755 index 958498fabd995982981339b00d0dcfa1ff376b6f..0000000000000000000000000000000000000000 --- a/cnn_benchmark/of_cnn_infer_benchmarks.py +++ /dev/null @@ -1,225 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time -import argparse -from datetime import datetime - -import oneflow as flow - -import data_loader -import vgg_model -import resnet_model -import alexnet_model -import inceptionv3_model - -parser = argparse.ArgumentParser(description="flags for cnn benchmark") - -# resouce -parser.add_argument("--gpu_num_per_node", type=int, default=1, required=False) -parser.add_argument("--node_num", type=int, default=1) -parser.add_argument( - "--node_list", - type=str, - default=None, - required=False, - help="nodes' IP address, split by comma", -) - -# train -parser.add_argument( - "--model", type=str, default="vgg16", required=False, help="vgg16 or resnet50" -) -parser.add_argument("--batch_size_per_device", type=int, default=8, required=False) -parser.add_argument( - "--iter_num", type=int, default=10, required=False, help="total iterations to run" -) -parser.add_argument( - "--warmup_iter_num", - type=int, - default=0, - required=False, - help="total iterations to run", -) -parser.add_argument( - "--data_dir", type=str, default=None, required=False, help="dataset directory" -) -parser.add_argument( - "--data_part_num", - type=int, - default=32, - required=False, - help="data part number in dataset", -) -parser.add_argument( - "--image_size", type=int, default=228, required=False, help="image size" -) - -parser.add_argument( - "--use_tensorrt", - dest="use_tensorrt", - action="store_true", - default=False, - required=False, - help="inference with tensorrt", -) -parser.add_argument( - "--use_xla_jit", - dest="use_xla_jit", - action="store_true", - default=False, - required=False, - help="inference with xla jit", -) - -parser.add_argument( - "--precision", - type=str, - default="float32", - required=False, - help="inference with low precision", -) - -# log and resore/save -parser.add_argument( - "--print_every_n_iter", - type=int, - default=1, - required=False, - help="print log every n iterations", -) -parser.add_argument( - "--model_load_dir", - type=str, - default=None, - required=False, - help="model load directory", -) -parser.add_argument( - "--log_dir", - type=str, - default="./output", - required=False, - help="log info save directory", -) - -args = parser.parse_args() - -model_dict = { - "resnet50": resnet_model.resnet50, - "inceptionv3": inceptionv3_model.inceptionv3, - "vgg16": vgg_model.vgg16, - "alexnet": alexnet_model.alexnet, -} - - -@flow.function -def InferenceNet(): - - total_device_num = args.node_num * args.gpu_num_per_node - batch_size = total_device_num * args.batch_size_per_device - - if args.use_tensorrt: - flow.config.use_tensorrt() - if args.use_xla_jit: - flow.config.use_xla_jit() - - if args.precision == "float16": - if not args.use_tensorrt: - flow.config.enable_auto_mixed_precision() - else: - flow.config.tensorrt.use_fp16() - - if args.data_dir: - assert os.path.exists(args.data_dir) - print("Loading data from {}".format(args.data_dir)) - (labels, images) = data_loader.load_imagenet( - args.data_dir, args.image_size, batch_size, args.data_part_num - ) - else: - print("Loading synthetic data.") - (labels, images) = data_loader.load_synthetic(args.image_size, batch_size) - - logits = model_dict[args.model](images) - softmax = flow.nn.softmax(logits) - return softmax - - -def main(): - print("=".ljust(66, "=")) - print( - "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format( - args.model, args.gpu_num_per_node, args.node_num - ) - ) - print("=".ljust(66, "=")) - for arg in vars(args): - print("{} = {}".format(arg, getattr(args, arg))) - print("-".ljust(66, "-")) - print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) - flow.config.default_data_type(flow.float) - flow.config.gpu_device_num(args.gpu_num_per_node) - flow.env.grpc_use_no_signal() - flow.env.log_dir(args.log_dir) - # flow.config.enable_inplace(False) - # flow.config.ctrl_port(12140) - - if args.node_num > 1: - nodes = [] - for n in args.node_list.strip().split(","): - addr_dict = {} - addr_dict["addr"] = n - nodes.append(addr_dict) - - flow.env.machine(nodes) - - check_point = flow.train.CheckPoint() - if args.model_load_dir: - assert os.path.isdir(args.model_load_dir) - print("Restoring model from {}.".format(args.model_load_dir)) - check_point.load(args.model_load_dir) - else: - print("Init model on demand.") - check_point.init() - - # warmups - print("Runing warm up for {} iterations.".format(args.warmup_iter_num)) - for step in range(args.warmup_iter_num): - predictions = InferenceNet().get() - - main.total_time = 0.0 - main.batch_size = args.node_num * args.gpu_num_per_node * args.batch_size_per_device - main.start_time = time.time() - - def create_callback(step): - def callback(predictions): - if step % args.print_every_n_iter == 0: - cur_time = time.time() - duration = cur_time - main.start_time - main.total_time += duration - main.start_time = cur_time - images_per_sec = main.batch_size / duration - print( - "iter {}, speed: {:.3f}(sec/batch), {:.3f}(images/sec)".format( - step, duration, images_per_sec - ) - ) - if step == args.iter_num - 1: - avg_img_per_sec = main.batch_size * args.iter_num / main.total_time - print("-".ljust(66, "-")) - print("average speed: {:.3f}(images/sec)".format(avg_img_per_sec)) - print("-".ljust(66, "-")) - - return callback - - for step in range(args.iter_num): - InferenceNet().async_get(create_callback(step)) - # predictions = InferenceNet().get() - # create_callback(step)(predictions) - # print(predictions) - - -if __name__ == "__main__": - main() diff --git a/cnn_benchmark/of_cnn_train_val.py b/cnn_benchmark/of_cnn_train_val.py index 5b0f59269c4d5d3ac0f5e99480623637e2254b7f..b59888da846bbd1005d51f777e329c1ed8def8d4 100755 --- a/cnn_benchmark/of_cnn_train_val.py +++ b/cnn_benchmark/of_cnn_train_val.py @@ -3,195 +3,117 @@ from __future__ import division from __future__ import print_function import os -import time import math -import numpy as np -import config as configs -parser = configs.get_parser() -args = parser.parse_args() -configs.print_args(args) - -from util import Snapshot, Summary, nodes_init, StopWatch -from dali import get_rec_iter import oneflow as flow -import vgg_model + +import ofrecord_util +import config as configs +from util import Snapshot, Summary, InitNodes, Metric +from job_function_util import get_train_config, get_val_config +import inception_model import resnet_model +import vgg_model import alexnet_model +parser = configs.get_parser() +args = parser.parse_args() +configs.print_args(args) + total_device_num = args.num_nodes * args.gpu_num_per_node train_batch_size = total_device_num * args.batch_size_per_device val_batch_size = total_device_num * args.val_batch_size_per_device (C, H, W) = args.image_shape epoch_size = math.ceil(args.num_examples / train_batch_size) -num_train_batches = epoch_size * args.num_epochs -num_warmup_batches = epoch_size * args.warmup_epochs -decay_batches = num_train_batches - num_warmup_batches -num_val_steps = args.num_val_examples / val_batch_size +num_val_steps = int(args.num_val_examples / val_batch_size) -summary = Summary(args.log_dir, args) -timer = StopWatch() model_dict = { "resnet50": resnet_model.resnet50, "vgg16": vgg_model.vgg16, "alexnet": alexnet_model.alexnet, -} - -optimizer_dict = { - "sgd": {"naive_conf": {}}, - "adam": {"adam_conf": {"beta1": 0.9}}, - "momentum": {"momentum_conf": {"beta": 0.9}}, - "momentum-decay": { - "momentum_conf": {"beta": 0.9}, - "learning_rate_decay": { - "polynomial_conf": {"decay_batches": 300000, "end_learning_rate": 0.0001,}, - }, - }, - "momentum-cosine-decay": { - "momentum_conf": {"beta": 0.875}, - "warmup_conf": {"linear_conf": {"warmup_batches":num_warmup_batches, "start_multiplier":0}}, - "learning_rate_decay": {"cosine_conf": {"decay_batches": decay_batches}}, - }, + "inceptionv3": inception_model.inceptionv3, } flow.config.gpu_device_num(args.gpu_num_per_node) flow.config.enable_debug_mode(True) -def get_train_config(): - train_config = flow.function_config() - #train_config.default_distribute_strategy(flow.distribute.consistent_strategy()) - train_config.default_data_type(flow.float) - train_config.train.primary_lr(args.learning_rate) - train_config.disable_all_reduce_sequence(True) - #train_config.all_reduce_group_min_mbyte(8) - #train_config.all_reduce_group_num(128) - # train_config.all_reduce_lazy_ratio(0) - - # train_config.enable_nccl_hierarchical_all_reduce(True) - # train_config.cudnn_buf_limit_mbyte(2048) - # train_config.concurrency_width(2) - train_config.all_reduce_group_num(128) - train_config.all_reduce_group_min_mbyte(8) - - train_config.train.model_update_conf(optimizer_dict[args.optimizer]) - - if args.weight_l2 and 0: - train_config.train.weight_l2(args.weight_l2) - - train_config.enable_inplace(True) - return train_config - - -image_shape = (args.batch_size_per_device, H, W, C) -label_shape = (args.batch_size_per_device, 1) -@flow.function(get_train_config()) -def TrainNet(images=flow.MirroredTensorDef(image_shape, dtype=flow.float), - labels=flow.MirroredTensorDef(label_shape, dtype=flow.int32)): - logits = model_dict[args.model](images) - loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss") + +if args.use_boxing_v2: + flow.config.collective_boxing.nccl_fusion_threshold_mb(8) + flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False) + + +@flow.function(get_train_config(args)) +def TrainNet(): + if args.train_data_dir: + assert os.path.exists(args.train_data_dir) + print("Loading data from {}".format(args.train_data_dir)) + if args.use_new_dataloader: + (labels, images) = ofrecord_util.load_imagenet_for_training2(args) + else: + (labels, images) = ofrecord_util.load_imagenet_for_training(args) + # note: images.shape = (N C H W) in cc's new dataloader(load_imagenet_for_training2) + else: + print("Loading synthetic data.") + (labels, images) = ofrecord_util.load_synthetic(args) + + + logits = model_dict[args.model]( + images, need_transpose=False if (args.use_new_dataloader and args.train_data_dir) else True) + loss = flow.nn.sparse_softmax_cross_entropy_with_logits( + labels, logits, name="softmax_loss") + loss = flow.math.reduce_mean(loss) flow.losses.add_loss(loss) - softmax = flow.nn.softmax(logits) - outputs = {"loss": loss, "softmax":softmax, "labels": labels} + predictions = flow.nn.softmax(logits) + outputs = {"loss": loss, "predictions": predictions, "labels": labels} return outputs -def get_val_config(): - val_config = flow.function_config() - #val_config.default_distribute_strategy(flow.distribute.consistent_strategy()) - val_config.default_data_type(flow.float) - return val_config - - -image_shape = (args.val_batch_size_per_device, H, W, C) -label_shape = (args.val_batch_size_per_device, 1) -@flow.function(get_val_config()) -def InferenceNet(images=flow.MirroredTensorDef(image_shape, dtype=flow.float), - labels=flow.MirroredTensorDef(label_shape, dtype=flow.int32)): - logits = model_dict[args.model](images) - softmax = flow.nn.softmax(logits) - outputs = {"softmax":softmax, "labels": labels} - return outputs#(softmax, labels) - - -def acc_acc(step, predictions): - classfications = np.argmax(predictions['softmax'].ndarray(), axis=1) - labels = predictions['labels'].reshape(-1) - if step == 0: - main.correct = 0.0 - main.total = 0.0 +@flow.function(get_val_config(args)) +def InferenceNet(): + if args.val_data_dir: + assert os.path.exists(args.val_data_dir) + print("Loading data from {}".format(args.val_data_dir)) + if args.use_new_dataloader: + (labels, images) = ofrecord_util.load_imagenet_for_validation2(args) + else: + (labels, images) = ofrecord_util.load_imagenet_for_validation(args) else: - main.correct += np.sum(classfications == labels); - main.total += len(labels) - - -def train_callback(epoch, step): - def callback(train_outputs): - acc_acc(step, train_outputs) - loss = train_outputs['loss'].mean() - summary.scalar('loss', loss, step) - #summary.scalar('learning_rate', train_outputs['lr'], step) - if (step-1) % args.loss_print_every_n_iter == 0: - throughput = args.loss_print_every_n_iter * train_batch_size / timer.split() - accuracy = main.correct/main.total - print("epoch {}, iter {}, loss: {:.6f}, accuracy: {:.6f}, samples/s: {:.3f}".format( - epoch, step-1, loss, accuracy, throughput)) - summary.scalar('train_accuracy', accuracy, step) - main.correct = 0.0 - main.total = 0.0 - return callback - - -def do_predictions(epoch, predict_step, predictions): - acc_acc(predict_step, predictions) - if predict_step + 1 == num_val_steps: - assert main.total > 0 - summary.scalar('top1_accuracy', main.correct/main.total, epoch) - #summary.scalar('top1_correct', main.correct, epoch) - #summary.scalar('total_val_images', main.total, epoch) - print("epoch {}, top 1 accuracy: {:.6f}, time: {:.2f}".format(epoch, - main.correct/main.total, timer.split())) - - -def predict_callback(epoch, predict_step): - def callback(predictions): - do_predictions(epoch, predict_step, predictions) - return callback + print("Loading synthetic data.") + (labels, images) = ofrecord_util.load_synthetic(args) + + logits = model_dict[args.model]( + images, need_transpose=False if (args.use_new_dataloader and args.train_data_dir) else True) + predictions = flow.nn.softmax(logits) + outputs = {"predictions": predictions, "labels": labels} + return outputs def main(): - nodes_init(args) + InitNodes(args) flow.env.grpc_use_no_signal() flow.env.log_dir(args.log_dir) + summary = Summary(args.log_dir, args) snapshot = Snapshot(args.model_save_dir, args.model_load_dir) - train_data_iter, val_data_iter = get_rec_iter(args, True) - timer.start() for epoch in range(args.num_epochs): - tic = time.time() - print('Starting epoch {} at {:.2f}'.format(epoch, tic)) - train_data_iter.reset() - for i, batches in enumerate(train_data_iter): - images, labels = batches - TrainNet(batches).async_get(train_callback(epoch, i)) - # if i > 30:#debug - # break - #break - print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic)) - if args.data_val: - tic = time.time() - val_data_iter.reset() - for i, batches in enumerate(val_data_iter): - images, labels = batches - InferenceNet(images, labels).async_get(predict_callback(epoch, i)) - #acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get()) - - summary.save() - snapshot.save('epoch_{}'.format(epoch+1)) + metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter, + summary=summary, save_summary_steps=epoch_size, + batch_size=train_batch_size, loss_key='loss') + for i in range(epoch_size): + TrainNet().async_get(metric.metric_cb(epoch, i)) + + if args.val_data_dir: + metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary, + save_summary_steps=num_val_steps, batch_size=val_batch_size) + for i in range(num_val_steps): + InferenceNet().async_get(metric.metric_cb(epoch, i)) + snapshot.save('epoch_{}'.format(epoch)) if __name__ == "__main__": diff --git a/cnn_benchmark/of_cnn_train_val_consistent.py b/cnn_benchmark/of_cnn_train_val_consistent.py deleted file mode 100755 index 4f96cfde352b8ee3286465f88c8c918f8790aca2..0000000000000000000000000000000000000000 --- a/cnn_benchmark/of_cnn_train_val_consistent.py +++ /dev/null @@ -1,195 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time -import math -import numpy as np - -import config as configs -parser = configs.get_parser() -args = parser.parse_args() -configs.print_args(args) - -from util import Snapshot, Summary, nodes_init, StopWatch -from dali_consistent import get_rec_iter -import oneflow as flow -import vgg_model -import resnet_model -import alexnet_model - - - -total_device_num = args.num_nodes * args.gpu_num_per_node -train_batch_size = total_device_num * args.batch_size_per_device -val_batch_size = total_device_num * args.val_batch_size_per_device -(C, H, W) = args.image_shape -epoch_size = math.ceil(args.num_examples / train_batch_size) -num_train_batches = epoch_size * args.num_epochs -num_warmup_batches = epoch_size * args.warmup_epochs -decay_batches = num_train_batches - num_warmup_batches -num_val_steps = args.num_val_examples / val_batch_size - -summary = Summary(args.log_dir, args) -timer = StopWatch() - -model_dict = { - "resnet50": resnet_model.resnet50, - "vgg16": vgg_model.vgg16, - "alexnet": alexnet_model.alexnet, -} - -optimizer_dict = { - "sgd": {"naive_conf": {}}, - "adam": {"adam_conf": {"beta1": 0.9}}, - "momentum": {"momentum_conf": {"beta": 0.9}}, - "momentum-decay": { - "momentum_conf": {"beta": 0.9}, - "learning_rate_decay": { - "polynomial_conf": {"decay_batches": 300000, "end_learning_rate": 0.0001,}, - }, - }, - "momentum-cosine-decay": { - "momentum_conf": {"beta": 0.875}, - "warmup_conf": {"linear_conf": {"warmup_batches":num_warmup_batches, "start_multiplier":0}}, - "learning_rate_decay": {"cosine_conf": {"decay_batches": decay_batches}}, - }, -} - - -flow.config.gpu_device_num(args.gpu_num_per_node) -flow.config.enable_debug_mode(True) -def get_train_config(): - train_config = flow.function_config() - train_config.default_distribute_strategy(flow.distribute.consistent_strategy()) - train_config.default_data_type(flow.float) - train_config.train.primary_lr(args.learning_rate) - train_config.disable_all_reduce_sequence(True) - #train_config.all_reduce_group_min_mbyte(8) - #train_config.all_reduce_group_num(128) - # train_config.all_reduce_lazy_ratio(0) - - # train_config.enable_nccl_hierarchical_all_reduce(True) - # train_config.cudnn_buf_limit_mbyte(2048) - # train_config.concurrency_width(2) - train_config.all_reduce_group_num(128) - train_config.all_reduce_group_min_mbyte(8) - - train_config.train.model_update_conf(optimizer_dict[args.optimizer]) - - if args.weight_l2 and 0: - train_config.train.weight_l2(args.weight_l2) - - train_config.enable_inplace(True) - return train_config - - -@flow.function(get_train_config()) -def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.float), - labels=flow.FixedTensorDef((train_batch_size, ), dtype=flow.int32)): - logits = model_dict[args.model](images) - loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss") - #loss = flow.math.reduce_mean(loss) - flow.losses.add_loss(loss) - softmax = flow.nn.softmax(logits) - outputs = {"loss": loss, "softmax":softmax, "labels": labels} - return outputs - - -def get_val_config(): - val_config = flow.function_config() - val_config.default_distribute_strategy(flow.distribute.consistent_strategy()) - val_config.default_data_type(flow.float) - return val_config - - -@flow.function(get_val_config()) -def InferenceNet(images=flow.FixedTensorDef((val_batch_size, H, W, C), dtype=flow.float), - labels=flow.FixedTensorDef((val_batch_size, ), dtype=flow.int32)): - logits = model_dict[args.model](images) - softmax = flow.nn.softmax(logits) - outputs = {"softmax":softmax, "labels": labels} - return outputs#(softmax, labels) - - -def acc_acc(step, predictions): - classfications = np.argmax(predictions['softmax'].ndarray(), axis=1) - labels = predictions['labels'].reshape(-1) - if step == 0: - main.correct = 0.0 - main.total = 0.0 - else: - main.correct += np.sum(classfications == labels); - main.total += len(labels) - - -def train_callback(epoch, step): - def callback(train_outputs): - acc_acc(step, train_outputs) - loss = train_outputs['loss'].mean() - summary.scalar('loss', loss, step) - #summary.scalar('learning_rate', train_outputs['lr'], step) - if (step-1) % args.loss_print_every_n_iter == 0: - throughput = args.loss_print_every_n_iter * train_batch_size / timer.split() - accuracy = main.correct/main.total - print("epoch {}, iter {}, loss: {:.6f}, accuracy: {:.6f}, samples/s: {:.3f}".format( - epoch, step-1, loss, accuracy, throughput)) - summary.scalar('train_accuracy', accuracy, step) - main.correct = 0.0 - main.total = 0.0 - return callback - - -def do_predictions(epoch, predict_step, predictions): - acc_acc(predict_step, predictions) - if predict_step + 1 == num_val_steps: - assert main.total > 0 - summary.scalar('top1_accuracy', main.correct/main.total, epoch) - #summary.scalar('top1_correct', main.correct, epoch) - #summary.scalar('total_val_images', main.total, epoch) - print("epoch {}, top 1 accuracy: {:.6f}, time: {:.2f}".format(epoch, - main.correct/main.total, timer.split())) - - -def predict_callback(epoch, predict_step): - def callback(predictions): - do_predictions(epoch, predict_step, predictions) - return callback - - -def main(): - nodes_init(args) - - flow.env.grpc_use_no_signal() - flow.env.log_dir(args.log_dir) - - snapshot = Snapshot(args.model_save_dir, args.model_load_dir) - - train_data_iter, val_data_iter = get_rec_iter(args, True) - timer.start() - for epoch in range(args.num_epochs): - tic = time.time() - print('Starting epoch {} at {:.2f}'.format(epoch, tic)) - train_data_iter.reset() - for i, batches in enumerate(train_data_iter): - images, labels = batches - TrainNet(images, labels).async_get(train_callback(epoch, i)) - # if i > 30:#debug - # break - #break - print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic)) - if args.data_val: - tic = time.time() - val_data_iter.reset() - for i, batches in enumerate(val_data_iter): - images, labels = batches - InferenceNet(images, labels).async_get(predict_callback(epoch, i)) - #acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get()) - - summary.save() - snapshot.save('epoch_{}'.format(epoch+1)) - - -if __name__ == "__main__": - main() diff --git a/cnn_e2e/of_cnn_val.py b/cnn_benchmark/of_cnn_val.py similarity index 100% rename from cnn_e2e/of_cnn_val.py rename to cnn_benchmark/of_cnn_val.py diff --git a/cnn_e2e/ofrecord_util.py b/cnn_benchmark/ofrecord_util.py old mode 100644 new mode 100755 similarity index 100% rename from cnn_e2e/ofrecord_util.py rename to cnn_benchmark/ofrecord_util.py diff --git a/cnn_e2e/optimizer_util.py b/cnn_benchmark/optimizer_util.py old mode 100644 new mode 100755 similarity index 100% rename from cnn_e2e/optimizer_util.py rename to cnn_benchmark/optimizer_util.py diff --git a/cnn_benchmark/plot_value.py b/cnn_benchmark/plot_value.py deleted file mode 100644 index 45399e74385d2f4c46871f271e9d566ea6f0a699..0000000000000000000000000000000000000000 --- a/cnn_benchmark/plot_value.py +++ /dev/null @@ -1,171 +0,0 @@ -# pip3 install -U altair vega_datasets jupyterlab --user -import altair as alt -import pandas as pd -import numpy as np -import os -import glob - - -def plot_value(df): - legends = df["legend"].unique() - poly_data = pd.DataFrame( - {"iter": np.linspace(df["iter"].min(), df["iter"].max(), 1000)} - ) - for legend in legends: - poly_data[legend + "-fit"] = np.poly1d( - np.polyfit( - df[df["legend"] == legend]["iter"], - df[df["legend"] == legend]["value"], - 3, - ) - )(poly_data["iter"]) - - base = alt.Chart(df).interactive() - - chart = base.mark_circle().encode(x="iter", y="value", color="legend:N") - - polynomial_fit = ( - alt.Chart(poly_data) - .transform_fold( - [legend + "-fit" for legend in legends], as_=["legend", "value"] - ) - .mark_line() - .encode(x="iter:Q", y="value:Q", color="legend:N") - ) - chart += polynomial_fit - chart.display() - - -def plot_by_legend(df): - legends = df["legend"].unique() - for legend in legends: - df[df["legend"] == legend] - plot_value(df[df["legend"] == legend]) - - -def plot_many_by_legend(df_dict): - legend_set_unsored = [] - legend_set_sorted = [ - "loss", - "top1_accuracy", - ] - for _, df in df_dict.items(): - for legend in list(df["legend"].unique()): - if ( - legend not in legend_set_sorted - and "note" in df - and df[df["legend"] == legend]["note"].size is 0 - ): - legend_set_unsored.append(legend) - for legend in legend_set_sorted + legend_set_unsored: - df_by_legend = pd.concat( - [ - update_legend(df[df["legend"] == legend].copy(), k) - for k, df in df_dict.items() - ], - axis=0, - sort=False, - ) - plot_value(df_by_legend) - - -def update_legend(df, prefix): - if df["legend"].size > 0: - df["legend"] = df.apply( - lambda row: "{}-{}".format(prefix, row["legend"]), axis=1 - ) - return df - - -COLUMNS = [ - "loss", -] - - -def make_loss_frame(hisogram, column_index, legend="undefined"): - assert column_index < len(COLUMNS) - ndarray = np.array(hisogram)[:, [column_index, len(COLUMNS)]] - return pd.DataFrame( - {"iter": ndarray[:, 1], "legend": legend, "value": ndarray[:, 0]} - ) - - -def make_loss_frame5(losses_hisogram, source): - return pd.concat( - [ - make_loss_frame(losses_hisogram, column_index, legend=column_name) - for column_index, column_name in enumerate(COLUMNS) - ], - axis=0, - ignore_index=True, - ) - - -def wildcard_at(path, index): - result = glob.glob(path) - assert len(result) > 0, "there is no files in {}".format(path) - result.sort(key=os.path.getmtime) - return result[index] - - -def get_df(path, wildcard, index=-1): - if os.path.isdir(path): - path = wildcard_at(os.path.join(path, wildcard), index) - - return pd.read_csv(path) - - -def get_metrics_sr(df1, df2): - limit = min(df1["iter"].max(), df2["iter"].max(), df1.size, df2.size) - rate = 1 if limit <= 2500 else limit // 2500 + 1 - return limit, rate - - -def subset_and_mod(df, limit, take_every_n): - df_limited = df[df["iter"] < limit] - return df_limited[df_limited["iter"] % take_every_n == 0] - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS) - parser.add_argument("-d", "--metrics_dir", type=str) - parser.add_argument("-o", "--oneflow_metrics_path", type=str) - parser.add_argument("-p", "--pytorch_metrics_path", type=str) - args = parser.parse_args() - - if hasattr(args, "metrics_dir"): - flow_metrics_path = args.metrics_dir - torch_metrics_path = args.metrics_dir - - if hasattr(args, "oneflow_metrics_path"): - flow_metrics_path = args.oneflow_metrics_path - - if hasattr(args, "pytorch_metrics_path"): - torch_metrics_path = args.pytorch_metrics_path - - assert os.path.exists(flow_metrics_path), "{} not found".format( - flow_metrics_path - ) - assert os.path.exists(torch_metrics_path), "{} not found".format( - torch_metrics_path - ) - - flow_df = get_df(flow_metrics_path, "loss*.csv") - flow_df.drop(["rank", "note"], axis=1) - if "primary_lr" in flow_df["legend"].unique(): - flow_df["legend"].replace("primary_lr", "lr", inplace=True) - flow_df = flow_df.groupby(["iter", "legend"], as_index=False).mean() - torch_df = get_df(torch_metrics_path, "torch*.csv") - if torch_df[torch_df["value"].notnull()]["iter"].min() == 0: - torch_df["iter"] += 1 - limit, rate = get_metrics_sr(flow_df, torch_df) - plot_many_by_legend( - { - "flow": subset_and_mod(flow_df, limit, rate), - "torch": subset_and_mod(torch_df, limit, rate), - } - ) - - # plot_by_legend(flow_df) diff --git a/cnn_benchmark/resnet_model.py b/cnn_benchmark/resnet_model.py index 7d4a4085a5997eedad8cece845044d20f690dae0..c8d9f94f386d0c279b0e94f1ed9fd5d8cb3890bb 100755 --- a/cnn_benchmark/resnet_model.py +++ b/cnn_benchmark/resnet_model.py @@ -19,14 +19,18 @@ def _conv2d( padding="SAME", data_format="NCHW", dilations=1, + weight_initializer=flow.variance_scaling_initializer( + 2, 'fan_in', 'random_normal', data_format="NCHW"), + weight_regularizer=flow.regularizers.l2(1.0/32768), trainable=True, - weight_initializer=flow.variance_scaling_initializer(data_format="NCHW"), ): weight = flow.get_variable( name + "-weight", - shape=(filters, input.static_shape[1], kernel_size, kernel_size), + shape=(filters, input.shape[1], kernel_size, kernel_size), dtype=input.dtype, initializer=weight_initializer, + regularizer=weight_regularizer, + model_name="weight", trainable=trainable, ) return flow.nn.conv2d( @@ -38,7 +42,7 @@ def _batch_norm(inputs, name=None, trainable=True): return flow.layers.batch_normalization( inputs=inputs, axis=1, - momentum=0.9,#97, + momentum=0.9, # 97, epsilon=1.001e-5, center=True, scale=True, @@ -121,9 +125,11 @@ def resnet_stem(input): return pool1 -def resnet50(images, trainable=True): +def resnet50(images, trainable=True, need_transpose=False): - images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) + # note: images.shape = (N C H W) in cc's new dataloader, transpose is not needed anymore + if need_transpose: + images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) with flow.deprecated.variable_scope("Resnet"): stem = resnet_stem(images) @@ -134,10 +140,13 @@ def resnet50(images, trainable=True): fc1001 = flow.layers.dense( flow.reshape(pool5, (pool5.shape[0], -1)), - units=1001, + units=1000, use_bias=True, - kernel_initializer=flow.xavier_uniform_initializer(), + kernel_initializer=flow.variance_scaling_initializer( + 2, 'fan_in', 'random_normal'), + # kernel_initializer=flow.xavier_uniform_initializer(), bias_initializer=flow.zeros_initializer(), + kernel_regularizer=flow.regularizers.l2(1.0/32768), trainable=trainable, name="fc1001", ) diff --git a/cnn_benchmark/util.py b/cnn_benchmark/util.py old mode 100644 new mode 100755 index e2d339b93549b228075b3aad5928e8751e313f62..f7e13406a66287e45018620a68765cf4cc5dc928 --- a/cnn_benchmark/util.py +++ b/cnn_benchmark/util.py @@ -1,23 +1,29 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + import os import time +import numpy as np import pandas as pd from datetime import datetime import oneflow as flow -def nodes_init(args): +def InitNodes(args): if args.num_nodes > 1: assert args.num_nodes <= len(args.node_ips) + flow.env.ctrl_port(12138) nodes = [] - for n in args.node_list.strip().split(","): + for ip in args.node_ips: addr_dict = {} - addr_dict["addr"] = n + addr_dict["addr"] = ip nodes.append(addr_dict) flow.env.machine(nodes) -class Snapshot: +class Snapshot(object): def __init__(self, model_save_dir, model_load_dir): self._model_save_dir = model_save_dir self._check_point = flow.train.CheckPoint() @@ -26,8 +32,9 @@ class Snapshot: print("Restoring model from {}.".format(model_load_dir)) self._check_point.load(model_load_dir) else: - print("Init model on demand.") self._check_point.init() + self.save('initial_model') + print("Init model on demand.") def save(self, name): snapshot_save_path = os.path.join(self._model_save_dir, "snapshot_{}".format(name)) @@ -37,25 +44,25 @@ class Snapshot: self._check_point.save(snapshot_save_path) -class Summary(): - def __init__(self, log_dir, config): +class Summary(object): + def __init__(self, log_dir, config, filename='summary.csv'): + self._filename = filename self._log_dir = log_dir - self._metrics = pd.DataFrame({"iter": 0, "legend": "cfg", "note": str(config)}, index=[0]) + self._metrics = pd.DataFrame({"epoch":0, "iter": 0, "legend": "cfg", "note": str(config)}, index=[0]) - def scalar(self, legend, value, step=-1): + def scalar(self, legend, value, epoch, step=-1): # TODO: support rank(which device/gpu) df = pd.DataFrame( - {"iter": step, "legend": legend, "value": value, "rank": 0, "time": time.time()}, + {"epoch": epoch, "iter": step, "legend": legend, "value": value, "rank": 0}, index=[0]) self._metrics = pd.concat([self._metrics, df], axis=0, sort=False) def save(self): - save_path = os.path.join(self._log_dir, "summary.csv") + save_path = os.path.join(self._log_dir, self._filename) self._metrics.to_csv(save_path, index=False) - print("saved: {}".format(save_path)) -class StopWatch: +class StopWatch(object): def __init__(self): pass @@ -75,3 +82,87 @@ class StopWatch: def duration(self): return self.stop_time - self.start_time + +def match_top_k(predictions, labels, top_k=1): + max_k_preds = predictions.argsort(axis=1)[:, -top_k:][:, ::-1] + match_array = np.logical_or.reduce(max_k_preds==labels.reshape((-1, 1)), axis=1) + num_matched = match_array.sum() + return num_matched, match_array.shape[0] + + +class Metric(object): + def __init__(self, summary=None, save_summary_steps=-1, desc='train', calculate_batches=-1, + batch_size=256, top_k=5, prediction_key='predictions', label_key='labels', + loss_key=None): + self.summary = summary + self.save_summary = isinstance(self.summary, Summary) + self.save_summary_steps = save_summary_steps + self.desc = desc + self.calculate_batches = calculate_batches + self.top_k = top_k + self.prediction_key = prediction_key + self.label_key = label_key + self.loss_key = loss_key + if loss_key: + self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" + else: + self.fmt = "{}: epoch {}, iter {}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" + + self.timer = StopWatch() + self.timer.start() + self._clear() + + def _clear(self): + self.top_1_num_matched = 0 + self.top_k_num_matched = 0 + self.num_samples = 0.0 + + def metric_cb(self, epoch, step): + def callback(outputs): + if step == 0: self._clear() + if self.prediction_key: + num_matched, num_samples = match_top_k(outputs[self.prediction_key], + outputs[self.label_key]) + self.top_1_num_matched += num_matched + num_matched, _ = match_top_k(outputs[self.prediction_key], + outputs[self.label_key], self.top_k) + self.top_k_num_matched += num_matched + else: + num_samples = outputs[self.label_key].shape[0] + + self.num_samples += num_samples + + if (step + 1) % self.calculate_batches == 0: + throughput = self.num_samples / self.timer.split() + if self.prediction_key: + top_1_accuracy = self.top_1_num_matched / self.num_samples + top_k_accuracy = self.top_k_num_matched / self.num_samples + else: + top_1_accuracy = 0.0 + top_k_accuracy = 0.0 + + if self.loss_key: + loss = outputs[self.loss_key].mean() + print(self.fmt.format(self.desc, epoch, step + 1, loss, top_1_accuracy, + top_k_accuracy, throughput)) + if self.save_summary: + self.summary.scalar(self.desc+"_" + self.loss_key, loss, epoch, step) + else: + print(self.fmt.format(self.desc, epoch, step + 1, top_1_accuracy, + top_k_accuracy, throughput)) + + self._clear() + if self.save_summary: + self.summary.scalar(self.desc + "_throughput", throughput, epoch, step) + if self.prediction_key: + self.summary.scalar(self.desc + "_top_1", top_1_accuracy, epoch, step) + self.summary.scalar(self.desc + "_top_{}".format(self.top_k), + top_k_accuracy, epoch, step) + + if self.save_summary: + if (step + 1) % self.save_summary_steps == 0: + self.summary.save() + + return callback + + diff --git a/cnn_benchmark/vgg_model.py b/cnn_benchmark/vgg_model.py index 5ca8c7014b12bada882cdcc61881ba518dd88408..274096ddbe13eb766b1f6dea198e81e72d3fdcd1 100755 --- a/cnn_benchmark/vgg_model.py +++ b/cnn_benchmark/vgg_model.py @@ -3,7 +3,6 @@ from __future__ import division from __future__ import print_function import oneflow as flow -import oneflow.core.operator.op_conf_pb2 as op_conf_util from model_util import conv2d_layer @@ -24,10 +23,12 @@ def _conv_block(in_blob, index, filters, conv_times): return conv_block -def vgg16(images, trainable=True): +def vgg16(images, need_transpose=False): - transposed = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) - conv1 = _conv_block(transposed, 0, 64, 2) + if need_transpose: + images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) + + conv1 = _conv_block(images, 0, 64, 2) pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1") conv2 = _conv_block(pool1, 2, 128, 2) @@ -42,26 +43,13 @@ def vgg16(images, trainable=True): conv5 = _conv_block(pool4, 10, 512, 3) pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", "NCHW", name="pool5") - def _get_kernel_initializer(): - kernel_initializer = op_conf_util.InitializerConf() - kernel_initializer.truncated_normal_conf.std = 0.816496580927726 - return kernel_initializer - - def _get_bias_initializer(): - bias_initializer = op_conf_util.InitializerConf() - bias_initializer.constant_conf.value = 0.0 - return bias_initializer - - pool5 = flow.reshape(pool5, [pool5.shape[0], -1]) - fc6 = flow.layers.dense( - inputs=pool5, + inputs=flow.reshape(pool5, [pool5.shape[0], -1]), units=4096, activation=flow.keras.activations.relu, use_bias=True, - kernel_initializer=_get_kernel_initializer(), - bias_initializer=_get_bias_initializer(), - trainable=trainable, + kernel_initializer=flow.truncated_normal(0.816496580927726), + bias_initializer=flow.constant_initializer(), name="fc1", ) @@ -72,9 +60,8 @@ def vgg16(images, trainable=True): units=4096, activation=flow.keras.activations.relu, use_bias=True, - kernel_initializer=_get_kernel_initializer(), - bias_initializer=_get_bias_initializer(), - trainable=trainable, + kernel_initializer=flow.truncated_normal(0.816496580927726), + bias_initializer=flow.constant_initializer(), name="fc2", ) fc7 = flow.nn.dropout(fc7, rate=0.5) @@ -83,9 +70,8 @@ def vgg16(images, trainable=True): inputs=fc7, units=1001, use_bias=True, - kernel_initializer=_get_kernel_initializer(), - bias_initializer=_get_bias_initializer(), - trainable=trainable, + kernel_initializer=flow.truncated_normal(0.816496580927726), + bias_initializer=flow.constant_initializer(), name="fc_final", ) diff --git a/cnn_e2e/alexnet_model.py b/cnn_e2e/alexnet_model.py deleted file mode 100755 index 9a559540964a8c158bebb419652baccd74b7aad6..0000000000000000000000000000000000000000 --- a/cnn_e2e/alexnet_model.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import oneflow as flow -from model_util import conv2d_layer - - -def alexnet(images, need_transpose=False): - - if need_transpose: - images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) - - conv1 = conv2d_layer( - "conv1", images, filters=64, kernel_size=11, strides=4, padding="VALID" - ) - - pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1") - - conv2 = conv2d_layer("conv2", pool1, filters=192, kernel_size=5) - - pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", "NCHW", name="pool2") - - conv3 = conv2d_layer("conv3", pool2, filters=384) - - conv4 = conv2d_layer("conv4", conv3, filters=384) - - conv5 = conv2d_layer("conv5", conv4, filters=256) - - pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", "NCHW", name="pool5") - - if len(pool5.shape) > 2: - pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1)) - - fc1 = flow.layers.dense( - inputs=pool5, - units=4096, - activation=flow.keras.activations.relu, - use_bias=False, - kernel_initializer=flow.random_uniform_initializer(), - bias_initializer=False, - name="fc1", - ) - - dropout1 = flow.nn.dropout(fc1, rate=0.5) - - fc2 = flow.layers.dense( - inputs=dropout1, - units=4096, - activation=flow.keras.activations.relu, - use_bias=False, - kernel_initializer=flow.random_uniform_initializer(), - bias_initializer=False, - name="fc2", - ) - - dropout2 = flow.nn.dropout(fc2, rate=0.5) - - fc3 = flow.layers.dense( - inputs=dropout2, - units=1001, - activation=None, - use_bias=False, - kernel_initializer=flow.random_uniform_initializer(), - bias_initializer=False, - name="fc3", - ) - - return fc3 diff --git a/cnn_e2e/config.py b/cnn_e2e/config.py deleted file mode 100644 index 284f1c67fbfcabb505e4214300472a2e4198f42d..0000000000000000000000000000000000000000 --- a/cnn_e2e/config.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -from datetime import datetime -import logging - -from optimizer_util import add_optimizer_args -from ofrecord_util import add_ofrecord_args - -def get_parser(parser=None): - def str_list(x): - return x.split(',') - - def int_list(x): - return list(map(int, x.split(','))) - - def float_list(x): - return list(map(float, x.split(','))) - - def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Unsupported value encountered.') - - if parser is None: - parser = argparse.ArgumentParser("flags for cnn benchmark") - - parser.add_argument("--dtype", type=str, default='float32', help="float16 float32") - - # resouce - parser.add_argument("--gpu_num_per_node", type=int, default=1) - parser.add_argument('--num_nodes', type=int, default=1, - help='node/machine number for training') - parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'], - help='nodes ip list for training, devided by ",", length >= num_nodes') - - parser.add_argument("--model", type=str, default="vgg16", help="vgg16 or resnet50") - parser.add_argument( - '--use_fp16', - type=str2bool, - nargs='?', - const=True, - help='Whether to use use fp16' - ) - parser.add_argument( - '--use_boxing_v2', - type=str2bool, - nargs='?', - const=True, - help='Whether to use boxing v2' - ) - - parser.add_argument( - '--use_new_dataloader', - type=str2bool, - nargs='?', - const=True, - help='Whether to use new dataloader' - ) - # train and validaion - parser.add_argument('--num_epochs', type=int, default=90, help='number of epochs') - parser.add_argument("--model_load_dir", type=str, default=None, help="model load directory if need") - parser.add_argument("--batch_size_per_device", type=int, default=64) - parser.add_argument("--val_batch_size_per_device", type=int, default=8) - - # for data process - parser.add_argument("--num_examples", type=int, default=1281167, help="train pic number") - parser.add_argument("--num_val_examples", type=int, default=50000, help="validation pic number") - parser.add_argument('--rgb-mean', type=float_list, default=[123.68, 116.779, 103.939], - help='a tuple of size 3 for the mean rgb') - parser.add_argument('--rgb-std', type=float_list, default=[58.393, 57.12, 57.375], - help='a tuple of size 3 for the std rgb') - parser.add_argument("--input_layout", type=str, default='NHWC', help="NCHW or NHWC") - parser.add_argument('--image-shape', type=int_list, default=[3, 224, 224], - help='the image shape feed into the network') - - - ## snapshot - parser.add_argument("--model_save_dir", type=str, - default="./output/snapshots/model_save-{}".format(str(datetime.now().strftime("%Y%m%d%H%M%S"))), - help="model save directory", - ) - - # log and loss print - parser.add_argument("--log_dir", type=str, default="./output", help="log info save directory") - parser.add_argument( - "--loss_print_every_n_iter", - type=int, - default=1, - help="print loss every n iteration", - ) - add_ofrecord_args(parser) - add_optimizer_args(parser) - return parser - - -def print_args(args): - print("=".ljust(66, "=")) - print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format( - args.model, args.gpu_num_per_node, args.num_nodes)) - print("=".ljust(66, "=")) - for arg in vars(args): - print("{} = {}".format(arg, getattr(args, arg))) - print("-".ljust(66, "-")) - print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - print_args(args) diff --git a/cnn_e2e/model_util.py b/cnn_e2e/model_util.py deleted file mode 100644 index 28e05ee9a4fffc03be08604b8571ffd7c903d2d0..0000000000000000000000000000000000000000 --- a/cnn_e2e/model_util.py +++ /dev/null @@ -1,92 +0,0 @@ -from __future__ import absolute_import - -import oneflow as flow - - -def conv2d_layer( - name, - input, - filters, - kernel_size=3, - strides=1, - padding="SAME", - data_format="NCHW", - dilation_rate=1, - activation="Relu", - use_bias=True, - weight_initializer=flow.random_uniform_initializer(), - bias_initializer=flow.constant_initializer(), -): - if isinstance(kernel_size, int): - kernel_size_1 = kernel_size - kernel_size_2 = kernel_size - if isinstance(kernel_size, list): - kernel_size_1 = kernel_size[0] - kernel_size_2 = kernel_size[1] - - weight_shape = (filters, input.shape[1], kernel_size_1, kernel_size_2) - weight = flow.get_variable( - name + "-weight", - shape=weight_shape, - dtype=input.dtype, - initializer=weight_initializer, - ) - output = flow.nn.conv2d( - input, weight, strides, padding, data_format, dilation_rate, name=name - ) - if use_bias: - bias = flow.get_variable( - name + "-bias", - shape=(filters,), - dtype=input.dtype, - initializer=bias_initializer, - ) - output = flow.nn.bias_add(output, bias, data_format) - - if activation is not None: - if activation == "Relu": - output = flow.keras.activations.relu(output) - else: - raise NotImplementedError - - return output - - -def conv2d_layer_with_bn( - name, - input, - filters, - kernel_size=3, - strides=1, - padding="SAME", - data_format="NCHW", - dilation_rate=1, - activation="Relu", - use_bias=True, - weight_initializer=flow.random_uniform_initializer(), - bias_initializer=flow.constant_initializer(), - use_bn=True, -): - output = conv2d_layer(name=name, - input=input, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - use_bias=use_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer) - - if use_bn: - output = flow.layers.batch_normalization(inputs=output, - axis=1, - momentum=0.997, - epsilon=1.001e-5, - center=True, - scale=True, - trainable=True, - name=name + "_bn") - return output diff --git a/cnn_e2e/of_cnn_train_val.py b/cnn_e2e/of_cnn_train_val.py deleted file mode 100755 index b59888da846bbd1005d51f777e329c1ed8def8d4..0000000000000000000000000000000000000000 --- a/cnn_e2e/of_cnn_train_val.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import math - -import oneflow as flow - -import ofrecord_util -import config as configs -from util import Snapshot, Summary, InitNodes, Metric -from job_function_util import get_train_config, get_val_config -import inception_model -import resnet_model -import vgg_model -import alexnet_model - - -parser = configs.get_parser() -args = parser.parse_args() -configs.print_args(args) - - -total_device_num = args.num_nodes * args.gpu_num_per_node -train_batch_size = total_device_num * args.batch_size_per_device -val_batch_size = total_device_num * args.val_batch_size_per_device -(C, H, W) = args.image_shape -epoch_size = math.ceil(args.num_examples / train_batch_size) -num_val_steps = int(args.num_val_examples / val_batch_size) - - -model_dict = { - "resnet50": resnet_model.resnet50, - "vgg16": vgg_model.vgg16, - "alexnet": alexnet_model.alexnet, - "inceptionv3": inception_model.inceptionv3, -} - - -flow.config.gpu_device_num(args.gpu_num_per_node) -flow.config.enable_debug_mode(True) - -if args.use_boxing_v2: - flow.config.collective_boxing.nccl_fusion_threshold_mb(8) - flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False) - - -@flow.function(get_train_config(args)) -def TrainNet(): - if args.train_data_dir: - assert os.path.exists(args.train_data_dir) - print("Loading data from {}".format(args.train_data_dir)) - if args.use_new_dataloader: - (labels, images) = ofrecord_util.load_imagenet_for_training2(args) - else: - (labels, images) = ofrecord_util.load_imagenet_for_training(args) - # note: images.shape = (N C H W) in cc's new dataloader(load_imagenet_for_training2) - else: - print("Loading synthetic data.") - (labels, images) = ofrecord_util.load_synthetic(args) - - - logits = model_dict[args.model]( - images, need_transpose=False if (args.use_new_dataloader and args.train_data_dir) else True) - loss = flow.nn.sparse_softmax_cross_entropy_with_logits( - labels, logits, name="softmax_loss") - loss = flow.math.reduce_mean(loss) - flow.losses.add_loss(loss) - predictions = flow.nn.softmax(logits) - outputs = {"loss": loss, "predictions": predictions, "labels": labels} - return outputs - - -@flow.function(get_val_config(args)) -def InferenceNet(): - if args.val_data_dir: - assert os.path.exists(args.val_data_dir) - print("Loading data from {}".format(args.val_data_dir)) - if args.use_new_dataloader: - (labels, images) = ofrecord_util.load_imagenet_for_validation2(args) - else: - (labels, images) = ofrecord_util.load_imagenet_for_validation(args) - else: - print("Loading synthetic data.") - (labels, images) = ofrecord_util.load_synthetic(args) - - logits = model_dict[args.model]( - images, need_transpose=False if (args.use_new_dataloader and args.train_data_dir) else True) - predictions = flow.nn.softmax(logits) - outputs = {"predictions": predictions, "labels": labels} - return outputs - - -def main(): - InitNodes(args) - - flow.env.grpc_use_no_signal() - flow.env.log_dir(args.log_dir) - - summary = Summary(args.log_dir, args) - snapshot = Snapshot(args.model_save_dir, args.model_load_dir) - - for epoch in range(args.num_epochs): - metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter, - summary=summary, save_summary_steps=epoch_size, - batch_size=train_batch_size, loss_key='loss') - for i in range(epoch_size): - TrainNet().async_get(metric.metric_cb(epoch, i)) - - if args.val_data_dir: - metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary, - save_summary_steps=num_val_steps, batch_size=val_batch_size) - for i in range(num_val_steps): - InferenceNet().async_get(metric.metric_cb(epoch, i)) - snapshot.save('epoch_{}'.format(epoch)) - - -if __name__ == "__main__": - main() diff --git a/cnn_e2e/resnet_model.py b/cnn_e2e/resnet_model.py deleted file mode 100755 index c8d9f94f386d0c279b0e94f1ed9fd5d8cb3890bb..0000000000000000000000000000000000000000 --- a/cnn_e2e/resnet_model.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import oneflow as flow - - -BLOCK_COUNTS = [3, 4, 6, 3] -BLOCK_FILTERS = [256, 512, 1024, 2048] -BLOCK_FILTERS_INNER = [64, 128, 256, 512] - - -def _conv2d( - name, - input, - filters, - kernel_size, - strides=1, - padding="SAME", - data_format="NCHW", - dilations=1, - weight_initializer=flow.variance_scaling_initializer( - 2, 'fan_in', 'random_normal', data_format="NCHW"), - weight_regularizer=flow.regularizers.l2(1.0/32768), - trainable=True, -): - weight = flow.get_variable( - name + "-weight", - shape=(filters, input.shape[1], kernel_size, kernel_size), - dtype=input.dtype, - initializer=weight_initializer, - regularizer=weight_regularizer, - model_name="weight", - trainable=trainable, - ) - return flow.nn.conv2d( - input, weight, strides, padding, data_format, dilations, name=name - ) - - -def _batch_norm(inputs, name=None, trainable=True): - return flow.layers.batch_normalization( - inputs=inputs, - axis=1, - momentum=0.9, # 97, - epsilon=1.001e-5, - center=True, - scale=True, - trainable=trainable, - name=name, - ) - - -def conv2d_affine(input, name, filters, kernel_size, strides, activation=None): - # input data_format must be NCHW, cannot check now - padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID" - output = _conv2d(name, input, filters, kernel_size, strides, padding) - output = _batch_norm(output, name + "_bn") - if activation == "Relu": - output = flow.keras.activations.relu(output) - - return output - - -def bottleneck_transformation(input, block_name, filters, filters_inner, strides): - a = conv2d_affine( - input, block_name + "_branch2a", filters_inner, 1, 1, activation="Relu", - ) - - b = conv2d_affine( - a, block_name + "_branch2b", filters_inner, 3, strides, activation="Relu", - ) - - c = conv2d_affine(b, block_name + "_branch2c", filters, 1, 1) - - return c - - -def residual_block(input, block_name, filters, filters_inner, strides_init): - if strides_init != 1 or block_name == "res2_0": - shortcut = conv2d_affine( - input, block_name + "_branch1", filters, 1, strides_init - ) - else: - shortcut = input - - bottleneck = bottleneck_transformation( - input, block_name, filters, filters_inner, strides_init - ) - - return flow.keras.activations.relu(bottleneck + shortcut) - - -def residual_stage(input, stage_name, counts, filters, filters_inner, stride_init=2): - output = input - for i in range(counts): - block_name = "%s_%d" % (stage_name, i) - output = residual_block( - output, block_name, filters, filters_inner, stride_init if i == 0 else 1, - ) - - return output - - -def resnet_conv_x_body(input, on_stage_end=lambda x: x): - output = input - for i, (counts, filters, filters_inner) in enumerate( - zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER) - ): - stage_name = "res%d" % (i + 2) - output = residual_stage( - output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2, - ) - on_stage_end(output) - - return output - - -def resnet_stem(input): - conv1 = _conv2d("conv1", input, 64, 7, 2) - conv1_bn = flow.keras.activations.relu(_batch_norm(conv1, "conv1_bn")) - pool1 = flow.nn.max_pool2d( - conv1_bn, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1", - ) - return pool1 - - -def resnet50(images, trainable=True, need_transpose=False): - - # note: images.shape = (N C H W) in cc's new dataloader, transpose is not needed anymore - if need_transpose: - images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) - - with flow.deprecated.variable_scope("Resnet"): - stem = resnet_stem(images) - body = resnet_conv_x_body(stem, lambda x: x) - pool5 = flow.nn.avg_pool2d( - body, ksize=7, strides=1, padding="VALID", data_format="NCHW", name="pool5", - ) - - fc1001 = flow.layers.dense( - flow.reshape(pool5, (pool5.shape[0], -1)), - units=1000, - use_bias=True, - kernel_initializer=flow.variance_scaling_initializer( - 2, 'fan_in', 'random_normal'), - # kernel_initializer=flow.xavier_uniform_initializer(), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=flow.regularizers.l2(1.0/32768), - trainable=trainable, - name="fc1001", - ) - - return fc1001 diff --git a/cnn_e2e/util.py b/cnn_e2e/util.py deleted file mode 100644 index f7e13406a66287e45018620a68765cf4cc5dc928..0000000000000000000000000000000000000000 --- a/cnn_e2e/util.py +++ /dev/null @@ -1,168 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time -import numpy as np -import pandas as pd -from datetime import datetime -import oneflow as flow - - -def InitNodes(args): - if args.num_nodes > 1: - assert args.num_nodes <= len(args.node_ips) - flow.env.ctrl_port(12138) - nodes = [] - for ip in args.node_ips: - addr_dict = {} - addr_dict["addr"] = ip - nodes.append(addr_dict) - - flow.env.machine(nodes) - - -class Snapshot(object): - def __init__(self, model_save_dir, model_load_dir): - self._model_save_dir = model_save_dir - self._check_point = flow.train.CheckPoint() - if model_load_dir: - assert os.path.isdir(model_load_dir) - print("Restoring model from {}.".format(model_load_dir)) - self._check_point.load(model_load_dir) - else: - self._check_point.init() - self.save('initial_model') - print("Init model on demand.") - - def save(self, name): - snapshot_save_path = os.path.join(self._model_save_dir, "snapshot_{}".format(name)) - if not os.path.exists(snapshot_save_path): - os.makedirs(snapshot_save_path) - print("Saving model to {}.".format(snapshot_save_path)) - self._check_point.save(snapshot_save_path) - - -class Summary(object): - def __init__(self, log_dir, config, filename='summary.csv'): - self._filename = filename - self._log_dir = log_dir - self._metrics = pd.DataFrame({"epoch":0, "iter": 0, "legend": "cfg", "note": str(config)}, index=[0]) - - def scalar(self, legend, value, epoch, step=-1): - # TODO: support rank(which device/gpu) - df = pd.DataFrame( - {"epoch": epoch, "iter": step, "legend": legend, "value": value, "rank": 0}, - index=[0]) - self._metrics = pd.concat([self._metrics, df], axis=0, sort=False) - - def save(self): - save_path = os.path.join(self._log_dir, self._filename) - self._metrics.to_csv(save_path, index=False) - - -class StopWatch(object): - def __init__(self): - pass - - def start(self): - self.start_time = time.time() - self.last_split = self.start_time - - def split(self): - now = time.time() - duration = now - self.last_split - self.last_split = now - return duration - - def stop(self): - self.stop_time = time.time() - - def duration(self): - return self.stop_time - self.start_time - - -def match_top_k(predictions, labels, top_k=1): - max_k_preds = predictions.argsort(axis=1)[:, -top_k:][:, ::-1] - match_array = np.logical_or.reduce(max_k_preds==labels.reshape((-1, 1)), axis=1) - num_matched = match_array.sum() - return num_matched, match_array.shape[0] - - -class Metric(object): - def __init__(self, summary=None, save_summary_steps=-1, desc='train', calculate_batches=-1, - batch_size=256, top_k=5, prediction_key='predictions', label_key='labels', - loss_key=None): - self.summary = summary - self.save_summary = isinstance(self.summary, Summary) - self.save_summary_steps = save_summary_steps - self.desc = desc - self.calculate_batches = calculate_batches - self.top_k = top_k - self.prediction_key = prediction_key - self.label_key = label_key - self.loss_key = loss_key - if loss_key: - self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" - else: - self.fmt = "{}: epoch {}, iter {}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" - - self.timer = StopWatch() - self.timer.start() - self._clear() - - def _clear(self): - self.top_1_num_matched = 0 - self.top_k_num_matched = 0 - self.num_samples = 0.0 - - def metric_cb(self, epoch, step): - def callback(outputs): - if step == 0: self._clear() - if self.prediction_key: - num_matched, num_samples = match_top_k(outputs[self.prediction_key], - outputs[self.label_key]) - self.top_1_num_matched += num_matched - num_matched, _ = match_top_k(outputs[self.prediction_key], - outputs[self.label_key], self.top_k) - self.top_k_num_matched += num_matched - else: - num_samples = outputs[self.label_key].shape[0] - - self.num_samples += num_samples - - if (step + 1) % self.calculate_batches == 0: - throughput = self.num_samples / self.timer.split() - if self.prediction_key: - top_1_accuracy = self.top_1_num_matched / self.num_samples - top_k_accuracy = self.top_k_num_matched / self.num_samples - else: - top_1_accuracy = 0.0 - top_k_accuracy = 0.0 - - if self.loss_key: - loss = outputs[self.loss_key].mean() - print(self.fmt.format(self.desc, epoch, step + 1, loss, top_1_accuracy, - top_k_accuracy, throughput)) - if self.save_summary: - self.summary.scalar(self.desc+"_" + self.loss_key, loss, epoch, step) - else: - print(self.fmt.format(self.desc, epoch, step + 1, top_1_accuracy, - top_k_accuracy, throughput)) - - self._clear() - if self.save_summary: - self.summary.scalar(self.desc + "_throughput", throughput, epoch, step) - if self.prediction_key: - self.summary.scalar(self.desc + "_top_1", top_1_accuracy, epoch, step) - self.summary.scalar(self.desc + "_top_{}".format(self.top_k), - top_k_accuracy, epoch, step) - - if self.save_summary: - if (step + 1) % self.save_summary_steps == 0: - self.summary.save() - - return callback - - diff --git a/cnn_e2e/vgg_model.py b/cnn_e2e/vgg_model.py deleted file mode 100755 index 274096ddbe13eb766b1f6dea198e81e72d3fdcd1..0000000000000000000000000000000000000000 --- a/cnn_e2e/vgg_model.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import oneflow as flow -from model_util import conv2d_layer - - -def _conv_block(in_blob, index, filters, conv_times): - conv_block = [] - conv_block.insert(0, in_blob) - for i in range(conv_times): - conv_i = conv2d_layer( - name="conv{}".format(index), - input=conv_block[i], - filters=filters, - kernel_size=3, - strides=1, - ) - conv_block.append(conv_i) - index += 1 - - return conv_block - - -def vgg16(images, need_transpose=False): - - if need_transpose: - images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2]) - - conv1 = _conv_block(images, 0, 64, 2) - pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1") - - conv2 = _conv_block(pool1, 2, 128, 2) - pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", "NCHW", name="pool2") - - conv3 = _conv_block(pool2, 4, 256, 3) - pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", "NCHW", name="pool3") - - conv4 = _conv_block(pool3, 7, 512, 3) - pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", "NCHW", name="pool4") - - conv5 = _conv_block(pool4, 10, 512, 3) - pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", "NCHW", name="pool5") - - fc6 = flow.layers.dense( - inputs=flow.reshape(pool5, [pool5.shape[0], -1]), - units=4096, - activation=flow.keras.activations.relu, - use_bias=True, - kernel_initializer=flow.truncated_normal(0.816496580927726), - bias_initializer=flow.constant_initializer(), - name="fc1", - ) - - fc6 = flow.nn.dropout(fc6, rate=0.5) - - fc7 = flow.layers.dense( - inputs=fc6, - units=4096, - activation=flow.keras.activations.relu, - use_bias=True, - kernel_initializer=flow.truncated_normal(0.816496580927726), - bias_initializer=flow.constant_initializer(), - name="fc2", - ) - fc7 = flow.nn.dropout(fc7, rate=0.5) - - fc8 = flow.layers.dense( - inputs=fc7, - units=1001, - use_bias=True, - kernel_initializer=flow.truncated_normal(0.816496580927726), - bias_initializer=flow.constant_initializer(), - name="fc_final", - ) - - return fc8