Commit d23d33fc authored by ShawnXuan

modify scripts for dali 2 nodes

Parent 227fef9a
...
@@ -3,7 +3,6 @@ from __future__ import division
 from __future__ import print_function
 import os
-import time
 import math
 import numpy as np
@@ -12,7 +11,7 @@ parser = configs.get_parser()
 args = parser.parse_args()
 configs.print_args(args)
-from util import Snapshot, Summary, InitNodes, StopWatch
+from util import Snapshot, Summary, InitNodes, Metric
 from dali_util import get_rec_iter
 from job_function_util import get_train_config, get_val_config
 import oneflow as flow
@@ -25,10 +24,9 @@ total_device_num = args.num_nodes * args.gpu_num_per_node
 train_batch_size = total_device_num * args.batch_size_per_device
 val_batch_size = total_device_num * args.val_batch_size_per_device
 (C, H, W) = args.image_shape
+epoch_size = math.ceil(args.num_examples / train_batch_size)
 num_val_steps = args.num_val_examples / val_batch_size
-summary = Summary(args.log_dir, args)
-timer = StopWatch()

 model_dict = {
     "resnet50": resnet_model.resnet50,
@@ -45,10 +43,10 @@ def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.
              labels=flow.FixedTensorDef((train_batch_size, ), dtype=flow.int32)):
     logits = model_dict[args.model](images)
     loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
-    #loss = flow.math.reduce_mean(loss)
+    loss = flow.math.reduce_mean(loss)
     flow.losses.add_loss(loss)
-    softmax = flow.nn.softmax(logits)
-    outputs = {"loss": loss, "softmax":softmax, "labels": labels}
+    predictions = flow.nn.softmax(logits)
+    outputs = {"loss": loss, "predictions":predictions, "labels": labels}
     return outputs
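Un-commenting reduce_mean changes the loss from a per-example vector of length train_batch_size to a scalar batch mean before it is registered with flow.losses.add_loss, which is what the new Metric helper later reports under loss_key='loss'. A NumPy-only sketch of the shape change (illustrative, not the OneFlow API):

import numpy as np

batch, classes = 4, 10
logits = np.random.randn(batch, classes)
labels = np.array([3, 7, 0, 9])

# Per-example sparse softmax cross-entropy, shape (batch,).
shifted = logits - logits.max(axis=1, keepdims=True)  # shift for numerical stability
log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
per_example_loss = -log_probs[np.arange(batch), labels]

loss = per_example_loss.mean()  # scalar, as TrainNet now returns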
@@ -56,87 +54,41 @@ def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.
 def InferenceNet(images=flow.FixedTensorDef((val_batch_size, H, W, C), dtype=flow.float),
                  labels=flow.FixedTensorDef((val_batch_size, ), dtype=flow.int32)):
     logits = model_dict[args.model](images)
-    softmax = flow.nn.softmax(logits)
-    outputs = {"softmax":softmax, "labels": labels}
+    predictions = flow.nn.softmax(logits)
+    outputs = {"predictions":predictions, "labels": labels}
     return outputs#(softmax, labels)

-def acc_acc(step, predictions):
-    classfications = np.argmax(predictions['softmax'].ndarray(), axis=1)
-    labels = predictions['labels'].reshape(-1)
-    if step == 0:
-        main.correct = 0.0
-        main.total = 0.0
-    else:
-        main.correct += np.sum(classfications == labels);
-        main.total += len(labels)
-
-def train_callback(epoch, step):
-    def callback(train_outputs):
-        acc_acc(step, train_outputs)
-        loss = train_outputs['loss'].mean()
-        summary.scalar('loss', loss, step)
-        #summary.scalar('learning_rate', train_outputs['lr'], step)
-        if (step-1) % args.loss_print_every_n_iter == 0:
-            throughput = args.loss_print_every_n_iter * train_batch_size / timer.split()
-            accuracy = main.correct/main.total
-            print("epoch {}, iter {}, loss: {:.6f}, accuracy: {:.6f}, samples/s: {:.3f}".format(
-                epoch, step-1, loss, accuracy, throughput))
-            summary.scalar('train_accuracy', accuracy, step)
-            main.correct = 0.0
-            main.total = 0.0
-    return callback
-
-def do_predictions(epoch, predict_step, predictions):
-    acc_acc(predict_step, predictions)
-    if predict_step + 1 == num_val_steps:
-        assert main.total > 0
-        summary.scalar('top1_accuracy', main.correct/main.total, epoch)
-        #summary.scalar('top1_correct', main.correct, epoch)
-        #summary.scalar('total_val_images', main.total, epoch)
-        print("epoch {}, top 1 accuracy: {:.6f}, time: {:.2f}".format(epoch,
-            main.correct/main.total, timer.split()))
-
-def predict_callback(epoch, predict_step):
-    def callback(predictions):
-        do_predictions(epoch, predict_step, predictions)
-    return callback
-
 def main():
     InitNodes(args)
     flow.env.grpc_use_no_signal()
     flow.env.log_dir(args.log_dir)
+    summary = Summary(args.log_dir, args)
     snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
     train_data_iter, val_data_iter = get_rec_iter(args, True)
-    timer.start()
     for epoch in range(args.num_epochs):
-        tic = time.time()
-        print('Starting epoch {} at {:.2f}'.format(epoch, tic))
+        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
+                        summary=summary, save_summary_steps=epoch_size,
+                        batch_size=train_batch_size, loss_key='loss')
         train_data_iter.reset()
         for i, batches in enumerate(train_data_iter):
             images, labels = batches
-            TrainNet(images, labels).async_get(train_callback(epoch, i))
+            TrainNet(images, labels).async_get(metric.metric_cb(epoch, i))
             # if i > 30:#debug
             #     break
             #break
-        print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic))
         if args.data_val:
-            tic = time.time()
+            metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
+                            save_summary_steps=num_val_steps, batch_size=val_batch_size)
             val_data_iter.reset()
             for i, batches in enumerate(val_data_iter):
                 images, labels = batches
-                InferenceNet(images, labels).async_get(predict_callback(epoch, i))
+                InferenceNet(images, labels).async_get(metric.metric_cb(epoch, i))
-            #acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get())
-        summary.save()
-        snapshot.save('epoch_{}'.format(epoch+1))
+        snapshot.save('epoch_{}'.format(epoch))

 if __name__ == "__main__":
...
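The deleted acc_acc/train_callback/do_predictions/predict_callback machinery, which stashed counters on the main function object and depended on the module-level summary and timer, is folded into the Metric helper now imported from util. Metric's implementation is not part of this diff; the following is a minimal sketch of the interface main() relies on (a metric_cb(epoch, step) factory that accumulates top-1 counts over calculate_batches batches and prints throughput), written against the same blob API as the removed code, not the actual util.Metric:

import time
import numpy as np

class Metric(object):
    # Sketch of the helper consumed as metric.metric_cb(epoch, i) above.
    def __init__(self, desc='train', calculate_batches=1, summary=None,
                 save_summary_steps=1, batch_size=1, loss_key=None):
        self.desc, self.summary = desc, summary
        self.calculate_batches = calculate_batches
        self.save_summary_steps = save_summary_steps
        self.batch_size, self.loss_key = batch_size, loss_key
        self.correct, self.total = 0.0, 0.0
        self.tic = time.time()

    def metric_cb(self, epoch, step):
        def callback(outputs):
            # outputs carries 'predictions' and 'labels' (plus 'loss' for TrainNet).
            preds = np.argmax(outputs['predictions'].ndarray(), axis=1)
            labels = outputs['labels'].ndarray().reshape(-1)
            self.correct += np.sum(preds == labels)
            self.total += len(labels)
            if (step + 1) % self.calculate_batches == 0:
                throughput = self.calculate_batches * self.batch_size / (time.time() - self.tic)
                accuracy = self.correct / self.total
                msg = "{}: epoch {}, iter {}, top1: {:.6f}, samples/s: {:.3f}".format(
                    self.desc, epoch, step, accuracy, throughput)
                if self.loss_key is not None:
                    msg += ", loss: {:.6f}".format(outputs[self.loss_key].mean())
                print(msg)
                if self.summary is not None:
                    self.summary.scalar(self.desc + '_top1', accuracy, step)
                self.correct, self.total = 0.0, 0.0
                self.tic = time.time()
        return callback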
 rm -rf core.*
-DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
+rm -rf output/snapshots/*
+#DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
+DATA_ROOT=/ssd/ImageNet/mxnet
 #DATA_ROOT=/dataset/imagenet-mxnet
 #python3 cnn_benchmark/of_cnn_train_val.py \
 #gdb --args \
@@ -9,11 +11,13 @@ DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
     --data_train_idx=$DATA_ROOT/train.idx \
     --data_val=$DATA_ROOT/val.rec \
     --data_val_idx=$DATA_ROOT/val.idx \
+    --num_nodes=2 \
+    --node_ips='11.11.1.12,11.11.1.14' \
     --gpu_num_per_node=4 \
     --optimizer="momentum-cosine-decay" \
     --learning_rate=0.256 \
     --loss_print_every_n_iter=20 \
-    --batch_size_per_device=64 \
+    --batch_size_per_device=32 \
     --val_batch_size_per_device=125 \
     --model="resnet50"
 #--use_fp16 true \
...
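On the script side, --num_nodes=2 and --node_ips name the two machines, and --batch_size_per_device drops from 64 to 32, so the global batch stays at 2 x 4 x 32 = 256 (the same as the old single-node 1 x 4 x 64) and --learning_rate=0.256 can stay untouched. These flags are consumed by InitNodes(args) in the Python file; its body is not shown in this diff, so the sketch below is only a guess at its shape, modeled on the flow.env.machine/flow.env.ctrl_port calls that alpha-era OneFlow used for multi-node setup (the port number is an arbitrary assumption):

import oneflow as flow

def InitNodes(args):
    # Register every worker's address with the OneFlow environment
    # before the first job function runs.
    if args.num_nodes > 1:
        ips = args.node_ips.split(',') if isinstance(args.node_ips, str) else list(args.node_ips)
        assert args.num_nodes <= len(ips)
        flow.env.ctrl_port(12138)  # assumed control port
        flow.env.machine([{'addr': ip} for ip in ips])

With a setup like this, the same launch script is typically run on each of the two machines with identical arguments.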