From d7ae4c1f76702e458b0de5ba33f42f23070f123b Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Wed, 1 Apr 2020 15:53:12 +0800
Subject: [PATCH] support io test

---
Reviewer notes (this section is ignored by `git am`):
  * Line structure was reconstructed from a copy whose newlines had been
    collapsed; verify the indentation of the three context lines against
    cnn_e2e/ofrecord_util.py before applying.
  * The `index` blob ids below are those of the original commit and no
    longer match the revised content.

 cnn_e2e/ofrecord_util.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 io_test.sh               | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100755 io_test.sh

diff --git a/cnn_e2e/ofrecord_util.py b/cnn_e2e/ofrecord_util.py
index b716058..90fbdd1 100644
--- a/cnn_e2e/ofrecord_util.py
+++ b/cnn_e2e/ofrecord_util.py
@@ -109,3 +109,53 @@ def load_imagenet_for_training2(args):
         mean=args.rgb_mean, std=args.rgb_std, output_dtype = flow.float)
     print(normal.shape)
     return label, normal
+
+
+if __name__ == "__main__":
+    # Stand-alone I/O test: build a job that only runs the data loader (no
+    # model) and pass each result to the Metric callback.
+    import os
+
+    import config as configs
+    from job_function_util import get_val_config
+    from util import Metric, Summary
+
+    parser = configs.get_parser()
+    args = parser.parse_args()
+    configs.print_args(args)
+
+    # Fail before the job is built, with a real exception: an `assert` is
+    # stripped under `python -O` and carries no message.
+    if args.train_data_dir and not os.path.exists(args.train_data_dir):
+        raise FileNotFoundError(
+            "train_data_dir does not exist: {}".format(args.train_data_dir))
+
+    flow.config.gpu_device_num(args.gpu_num_per_node)
+    flow.config.enable_debug_mode(True)
+
+    @flow.function(get_val_config(args))
+    def IOTest():
+        # Real data when --train_data_dir is given, synthetic data otherwise.
+        if args.train_data_dir:
+            print("Loading data from {}".format(args.train_data_dir))
+            # load_imagenet_for_training2(args) is the alternative pipeline.
+            (labels, images) = load_imagenet_for_training(args)
+        else:
+            print("Loading synthetic data.")
+            (labels, images) = load_synthetic(args)
+        # Labels stand in for predictions -- presumably so the Metric callback
+        # finds the keys it reads; confirm against util.Metric.
+        return {"predictions": labels, "labels": labels}
+
+    total_device_num = args.num_nodes * args.gpu_num_per_node
+    train_batch_size = total_device_num * args.batch_size_per_device
+    summary = Summary(args.log_dir, args, filename='io_test.csv')
+    metric = Metric(desc='io_test',
+                    calculate_batches=args.loss_print_every_n_iter,
+                    summary=summary,
+                    save_summary_steps=args.loss_print_every_n_iter,
+                    batch_size=train_batch_size)
+
+    num_iterations = 1000  # fixed length of the I/O test run
+    for i in range(num_iterations):
+        IOTest().async_get(metric.metric_cb(0, i))
diff --git a/io_test.sh b/io_test.sh
new file mode 100755
index 0000000..c6af6f8
--- /dev/null
+++ b/io_test.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Exercise the OFRecord data-loading pipeline on its own by running
+# cnn_e2e/ofrecord_util.py as a script. Run from the repository root.
+set -e
+
+# Remove core dumps left by earlier runs.
+rm -f core.*
+
+# Dataset root; override with e.g. DATA_ROOT=/dataset/imagenet-mxnet ./io_test.sh
+# (previously used: /mnt/13_nfs/xuan/ImageNet/ofrecord).
+DATA_ROOT="${DATA_ROOT:-/dataset/ImageNet/ofrecord}"
+
+# To debug or profile, prefix the command with `gdb --args` or
+# `nvprof -f -o resnet.nvvp`.
+python3 cnn_e2e/ofrecord_util.py \
+    --train_data_dir="$DATA_ROOT/train" \
+    --train_data_part_num=256 \
+    --val_data_dir="$DATA_ROOT/validation" \
+    --val_data_part_num=256 \
+    --num_nodes=1 \
+    --node_ips='11.11.1.13,11.11.1.14' \
+    --gpu_num_per_node=4 \
+    --optimizer="momentum-cosine-decay" \
+    --learning_rate=0.256 \
+    --loss_print_every_n_iter=20 \
+    --batch_size_per_device=64 \
+    --val_batch_size_per_device=125 \
+    --model="resnet50"
+
+# Flags that were tried earlier and are kept for reference (not passed):
+#   --use_fp16 true
+#   --weight_l2=3.0517578125e-05
+#   --num_examples=1024
+#   --optimizer="momentum-decay"
+#   --data_dir="/mnt/13_nfs/xuan/ImageNet/ofrecord/train"
+#   --data_dir="/mnt/dataset/xuan/ImageNet/ofrecord/train"
+#   --warmup_iter_num=10000
-- 
GitLab