From d23d33fca9d53270e727593169fe46febe97a4fe Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Wed, 1 Apr 2020 00:26:30 +0800
Subject: [PATCH] modify scripts for dali 2 nodes

---
 cnn_e2e/dali_cnn_train_val.py | 80 +++++++----------------------------
 dali_e2e.sh                   |  8 +++-
 2 files changed, 22 insertions(+), 66 deletions(-)

diff --git a/cnn_e2e/dali_cnn_train_val.py b/cnn_e2e/dali_cnn_train_val.py
index 966fdbe..936d0a2 100755
--- a/cnn_e2e/dali_cnn_train_val.py
+++ b/cnn_e2e/dali_cnn_train_val.py
@@ -3,7 +3,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import time
 import math
 import numpy as np
 
@@ -12,7 +11,7 @@ parser = configs.get_parser()
 args = parser.parse_args()
 configs.print_args(args)
 
-from util import Snapshot, Summary, InitNodes, StopWatch
+from util import Snapshot, Summary, InitNodes, Metric
 from dali_util import get_rec_iter
 from job_function_util import get_train_config, get_val_config
 import oneflow as flow
@@ -25,10 +24,9 @@ total_device_num = args.num_nodes * args.gpu_num_per_node
 train_batch_size = total_device_num * args.batch_size_per_device
 val_batch_size = total_device_num * args.val_batch_size_per_device
 (C, H, W) = args.image_shape
+epoch_size = math.ceil(args.num_examples / train_batch_size)
 num_val_steps = args.num_val_examples / val_batch_size
 
-summary = Summary(args.log_dir, args)
-timer = StopWatch()
 
 model_dict = {
     "resnet50": resnet_model.resnet50,
@@ -45,10 +43,10 @@ def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.
              labels=flow.FixedTensorDef((train_batch_size, ), dtype=flow.int32)):
     logits = model_dict[args.model](images)
     loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
-    #loss = flow.math.reduce_mean(loss)
+    loss = flow.math.reduce_mean(loss)
     flow.losses.add_loss(loss)
-    softmax = flow.nn.softmax(logits)
-    outputs = {"loss": loss, "softmax":softmax, "labels": labels}
+    predictions = flow.nn.softmax(logits)
+    outputs = {"loss": loss, "predictions":predictions, "labels": labels}
     return outputs
 
 
@@ -56,87 +54,41 @@
 def InferenceNet(images=flow.FixedTensorDef((val_batch_size, H, W, C), dtype=flow.float),
                  labels=flow.FixedTensorDef((val_batch_size, ), dtype=flow.int32)):
     logits = model_dict[args.model](images)
-    softmax = flow.nn.softmax(logits)
-    outputs = {"softmax":softmax, "labels": labels}
+    predictions = flow.nn.softmax(logits)
+    outputs = {"predictions":predictions, "labels": labels}
     return outputs#(softmax, labels)
 
 
-def acc_acc(step, predictions):
-    classfications = np.argmax(predictions['softmax'].ndarray(), axis=1)
-    labels = predictions['labels'].reshape(-1)
-    if step == 0:
-        main.correct = 0.0
-        main.total = 0.0
-    else:
-        main.correct += np.sum(classfications == labels);
-        main.total += len(labels)
-
-
-def train_callback(epoch, step):
-    def callback(train_outputs):
-        acc_acc(step, train_outputs)
-        loss = train_outputs['loss'].mean()
-        summary.scalar('loss', loss, step)
-        #summary.scalar('learning_rate', train_outputs['lr'], step)
-        if (step-1) % args.loss_print_every_n_iter == 0:
-            throughput = args.loss_print_every_n_iter * train_batch_size / timer.split()
-            accuracy = main.correct/main.total
-            print("epoch {}, iter {}, loss: {:.6f}, accuracy: {:.6f}, samples/s: {:.3f}".format(
-                epoch, step-1, loss, accuracy, throughput))
-            summary.scalar('train_accuracy', accuracy, step)
-            main.correct = 0.0
-            main.total = 0.0
-    return callback
-
-
-def do_predictions(epoch, predict_step, predictions):
-    acc_acc(predict_step, predictions)
-    if predict_step + 1 == num_val_steps:
-        assert main.total > 0
-        summary.scalar('top1_accuracy', main.correct/main.total, epoch)
-        #summary.scalar('top1_correct', main.correct, epoch)
-        #summary.scalar('total_val_images', main.total, epoch)
-        print("epoch {}, top 1 accuracy: {:.6f}, time: {:.2f}".format(epoch,
-            main.correct/main.total, timer.split()))
-
-
-def predict_callback(epoch, predict_step):
-    def callback(predictions):
-        do_predictions(epoch, predict_step, predictions)
-    return callback
-
-
 def main():
     InitNodes(args)
     flow.env.grpc_use_no_signal()
     flow.env.log_dir(args.log_dir)
 
+    summary = Summary(args.log_dir, args)
     snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
 
     train_data_iter, val_data_iter = get_rec_iter(args, True)
-    timer.start()
     for epoch in range(args.num_epochs):
-        tic = time.time()
-        print('Starting epoch {} at {:.2f}'.format(epoch, tic))
+        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
+                        summary=summary, save_summary_steps=epoch_size,
+                        batch_size=train_batch_size, loss_key='loss')
         train_data_iter.reset()
         for i, batches in enumerate(train_data_iter):
            images, labels = batches
-            TrainNet(images, labels).async_get(train_callback(epoch, i))
+            TrainNet(images, labels).async_get(metric.metric_cb(epoch, i))
            # if i > 30:#debug
            #     break
        #break
-        print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic))
 
         if args.data_val:
-            tic = time.time()
+            metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
+                            save_summary_steps=num_val_steps, batch_size=val_batch_size)
             val_data_iter.reset()
             for i, batches in enumerate(val_data_iter):
                 images, labels = batches
-                InferenceNet(images, labels).async_get(predict_callback(epoch, i))
-                #acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get())
+                InferenceNet(images, labels).async_get(metric.metric_cb(epoch, i))
 
-        summary.save()
-        snapshot.save('epoch_{}'.format(epoch+1))
+        snapshot.save('epoch_{}'.format(epoch))
 
 
 if __name__ == "__main__":
diff --git a/dali_e2e.sh b/dali_e2e.sh
index 0c1e54b..5591e3a 100755
--- a/dali_e2e.sh
+++ b/dali_e2e.sh
@@ -1,5 +1,7 @@
 rm -rf core.*
-DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
+rm -rf output/snapshots/*
+#DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
+DATA_ROOT=/ssd/ImageNet/mxnet
 #DATA_ROOT=/dataset/imagenet-mxnet
 #python3 cnn_benchmark/of_cnn_train_val.py \
 #gdb --args \
@@ -9,11 +11,13 @@ DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
     --data_train_idx=$DATA_ROOT/train.idx \
     --data_val=$DATA_ROOT/val.rec \
     --data_val_idx=$DATA_ROOT/val.idx \
+    --num_nodes=2 \
+    --node_ips='11.11.1.12,11.11.1.14' \
     --gpu_num_per_node=4 \
    --optimizer="momentum-cosine-decay" \
    --learning_rate=0.256 \
    --loss_print_every_n_iter=20 \
-    --batch_size_per_device=64 \
+    --batch_size_per_device=32 \
    --val_batch_size_per_device=125 \
    --model="resnet50"
 #--use_fp16 true \
-- 
GitLab
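
Note on the batch-size change in dali_e2e.sh: going from 1 node to 2 nodes with
4 GPUs each while halving --batch_size_per_device from 64 to 32 keeps the global
batch at 2 * 4 * 32 = 256 (previously 1 * 4 * 64 = 256), which is presumably why
--learning_rate=0.256 is left untouched.

The rewritten training loop delegates all loss/accuracy/throughput bookkeeping
to the Metric class imported from util. Below is a minimal sketch of a callback
factory matching the interface visible at the two call sites (metric_cb(epoch,
step) plus the desc/calculate_batches/summary/save_summary_steps/batch_size/
loss_key keyword arguments). The body is illustrative, not the actual util.py
implementation: it assumes the callback receives the output dicts of
TrainNet/InferenceNet as blobs exposing .ndarray(), as the removed acc_acc()
helper did, and it omits the summary persistence implied by save_summary_steps.

import time

import numpy as np


class Metric(object):
    """Sketch of a callback factory for OneFlow's async_get().

    Accumulates top-1 accuracy (and mean loss when loss_key is set) over
    calculate_batches batches, then prints throughput and resets the counters.
    """

    def __init__(self, desc='train', calculate_batches=-1, summary=None,
                 save_summary_steps=-1, batch_size=256, loss_key=None):
        self.desc = desc
        self.calculate_batches = calculate_batches
        self.summary = summary
        self.save_summary_steps = save_summary_steps  # persistence omitted here
        self.batch_size = batch_size
        self.loss_key = loss_key
        self.num_correct = 0.0
        self.num_samples = 0.0
        self.timer = time.time()

    def metric_cb(self, epoch, step):
        def callback(outputs):
            # outputs mirrors the dict returned by TrainNet/InferenceNet; each
            # value is a blob exposing .ndarray(), as in the removed acc_acc().
            predictions = outputs['predictions'].ndarray()
            labels = outputs['labels'].ndarray().reshape(-1)
            self.num_correct += np.sum(np.argmax(predictions, axis=1) == labels)
            self.num_samples += len(labels)
            if (step + 1) % self.calculate_batches == 0:
                elapsed = time.time() - self.timer
                throughput = self.calculate_batches * self.batch_size / elapsed
                accuracy = self.num_correct / self.num_samples
                if self.loss_key is not None:
                    loss = outputs[self.loss_key].ndarray().mean()
                    print("{}: epoch {}, iter {}, loss: {:.6f}, accuracy: {:.6f}, "
                          "samples/s: {:.3f}".format(self.desc, epoch, step,
                                                     loss, accuracy, throughput))
                else:
                    print("{}: epoch {}, accuracy: {:.6f}".format(
                        self.desc, epoch, accuracy))
                if self.summary is not None:
                    self.summary.scalar('{}_accuracy'.format(self.desc),
                                        accuracy, step)
                self.num_correct = 0.0
                self.num_samples = 0.0
                self.timer = time.time()
        return callback

Compared with the removed module-level summary/timer globals and the separate
train_callback/predict_callback closures, keeping the accumulator state on a
per-epoch Metric instance lets one class serve both the training and the
validation loop, which is what allows the patch to delete roughly 50 lines.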