提交 5462fe30 编写于 作者: S ShawnXuan

support iteration

上级 a88bc643
......@@ -18,8 +18,10 @@ def get_parser(parser=None):
# resouce
parser.add_argument("--gpu_num_per_node", type=int, default=1)
parser.add_argument("--node_num", type=int, default=1)
parser.add_argument("--node_list", type=str, default=None, help="nodes' IP address, split by comma")
parser.add_argument('--num_nodes', type=int, default=1,
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.15', '192.168.1.16'],
help='nodes ip list for training, devided by ",", length >= num_nodes')
parser.add_argument("--model", type=str, default="vgg16", help="vgg16 or resnet50")
......@@ -35,6 +37,17 @@ def get_parser(parser=None):
parser.add_argument("--image_size", type=int, default=224, help="image size")#Todo, remove
# from mxnet
parser.add_argument('--num_epochs', type=int, default=90, help='number of epochs')
parser.add_argument('--lr', type=float, default=0.1, help='initial learning rate')
parser.add_argument('--lr-schedule', choices=('multistep', 'cosine'), default='cosine',
help='learning rate schedule')
parser.add_argument('--lr-factor', type=float, default=0.256,
help='the ratio to reduce lr on each step')
parser.add_argument('--lr-steps', type=float_list, default=[],
help='the epochs to reduce the lr, e.g. 30,60')
parser.add_argument('--warmup-epochs', type=int, default=5,
help='the epochs to ramp-up lr to scaled large-batch value')
parser.add_argument("--input_layout", type=str, default='NHWC', help="NCHW or NHWC")
parser.add_argument('--image-shape', type=int_list, default=[3, 224, 224],
help='the image shape feed into the network')
......@@ -64,7 +77,7 @@ def get_parser(parser=None):
# validation
parser.add_argument("--val_step_num", type=int, default=10, help="total validation step number")
parser.add_argument("--val_batch_size_per_device", type=int, default=8)
parser.add_argument("--val_batch_size_per_device", type=int, default=100)
parser.add_argument("--val_data_dir", type=str, default=None, help="validation dataset directory")
parser.add_argument("--val_data_part_num", type=int, default=32, help="validation data part number")
......
此差异已折叠。
......@@ -5,6 +5,7 @@ from __future__ import print_function
import os
import time
import numpy as np
import logging
import oneflow as flow
......@@ -14,14 +15,13 @@ import resnet_model
import alexnet_model
import config as configs
from util import Snapshot, Summary, print_args, make_lr
from dali import get_rec_pipe
from util import Snapshot, Summary, print_args, make_lr, nodes_init
from dali import get_rec_iter
parser = configs.get_parser()
#args = parser.parse_known_args()[0]
args = parser.parse_args()
print(args)
summary = Summary(args.log_dir, args)
......@@ -47,7 +47,7 @@ optimizer_dict = {
# "warmup_conf": {"linear_conf": {"warmup_batches":10000, "start_multiplier":0}},
total_device_num = args.node_num * args.gpu_num_per_node
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
val_batch_size = total_device_num * args.val_batch_size_per_device
(H, W, C) = (args.image_size, args.image_size, 3)
......@@ -148,54 +148,54 @@ def InferenceNet():
return (softmax, labels)
def train_callback(step):
    """Build an async callback that records the training loss for iteration ``step``.

    The returned callable receives the train net's output dict, logs the mean
    loss to the summary writer, and periodically prints it to stdout.
    """
    prev_iter = step - 1

    def _on_train_outputs(outputs):
        # Reduce the per-sample losses of this iteration to a single scalar.
        mean_loss = outputs['loss'].mean()
        summary.scalar('loss', mean_loss, step)
        #summary.scalar('learning_rate', outputs['lr'], step)
        if prev_iter % args.loss_print_every_n_iter == 0:
            print("iter {}, loss: {:.6f}".format(prev_iter, mean_loss))

    return _on_train_outputs
def do_predictions(step, predict_step, predictions):
    """Accumulate top-1 accuracy across one validation pass and report at the end.

    Args:
        step: current training iteration, used to tag the summary scalar.
        predict_step: index of this validation batch within the pass
            (0 .. args.val_step_num - 1).
        predictions: (softmax, labels) pair from the inference net;
            predictions[0] is expected to expose .ndarray() — TODO confirm
            the exact blob type against InferenceNet.

    Running counters are stored as attributes on ``main`` so they survive
    between per-batch callback invocations.
    """
    classifications = np.argmax(predictions[0].ndarray(), axis=1)
    labels = predictions[1]
    if predict_step == 0:
        # Start of a new validation pass: reset the running counters.
        main.correct = 0.0
        main.total = 0.0
    # BUG FIX: the original accumulated only in an ``else`` branch, so the
    # first validation batch was silently excluded from the accuracy.
    main.correct += np.sum(classifications == labels)
    main.total += len(labels)
    if predict_step + 1 == args.val_step_num:
        assert main.total > 0
        summary.scalar('top1_accuracy', main.correct/main.total, step)
        #summary.scalar('top1_correct', main.correct, step)
        #summary.scalar('total_val_images', main.total, step)
        print("iter {}, top 1 accuracy: {:.6f}".format(step, main.correct/main.total))
def predict_callback(step, predict_step):
    """Return an async callback that forwards inference outputs to do_predictions."""
    return lambda predictions: do_predictions(step, predict_step, predictions)
def main():
print_args(args)
    def train_callback(step):
        """Return an async callback that logs training loss for iteration ``step``."""
        def callback(train_outputs):
            # Mean loss over the outputs returned by the train net for this iter.
            loss = train_outputs['loss'].mean()
            summary.scalar('loss', loss, step)
            #summary.scalar('learning_rate', train_outputs['lr'], step)
            if (step-1) % args.loss_print_every_n_iter == 0:
                print("iter {}, loss: {:.6f}".format(step-1, loss))
        return callback
    def do_predictions(step, predict_step, predictions):
        """Accumulate validation top-1 accuracy; report at the last val batch.

        Running counters live as attributes on ``main`` so they persist
        between per-batch callback invocations.
        """
        classfications = np.argmax(predictions[0].ndarray(), axis=1)
        labels = predictions[1]
        if predict_step == 0:
            # Start of a validation pass: reset the running counters.
            main.correct = 0.0
            main.total = 0.0
        else:
            # NOTE(review): because of this ``else``, batch 0 is never
            # accumulated — the first validation batch is excluded from the
            # reported accuracy. Confirm whether this is intended.
            main.correct += np.sum(classfications == labels);
            main.total += len(labels)
        if predict_step + 1 == args.val_step_num:
            assert main.total > 0
            summary.scalar('top1_accuracy', main.correct/main.total, step)
            #summary.scalar('top1_correct', main.correct, step)
            #summary.scalar('total_val_images', main.total, step)
            print("iter {}, top 1 accuracy: {:.6f}".format(step, main.correct/main.total))
    def predict_callback(step, predict_step):
        """Return an async callback forwarding inference results to do_predictions."""
        def callback(predictions):
            do_predictions(step, predict_step, predictions)
        return callback
nodes_init(args)
flow.env.grpc_use_no_signal()
flow.env.log_dir(args.log_dir)
if args.node_num > 1:
nodes = []
for n in args.node_list.strip().split(","):
addr_dict = {}
addr_dict["addr"] = n
nodes.append(addr_dict)
flow.env.machine(nodes)
snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
epoch=0
for epoch in range(args.num_epoch):
logging.info('Starting epoch {}'.format(epoch))
train_pipe, val_pipe = get_rec_pipe(args, True, seed=epoch)
exit()
train_pipe, _ = get_rec_pipe(args, True)
for step in range(args.train_step_num):
# save model every n iter
images, labels = train_pipe.run()
......@@ -207,8 +207,6 @@ def main():
snapshot.save(step)
#TrainNet().async_get(train_callback(step+1))
#print(images.as_cpu().as_array().shape)
#break
NumpyTrainNet(images.as_cpu().as_array(), labels.as_array().astype(np.int32)).async_get(train_callback(step+1))
step += 1
......
......@@ -5,6 +5,18 @@ from datetime import datetime
import oneflow as flow
def nodes_init(args):
    """Register the machines of a multi-node run with OneFlow.

    Uses the first ``args.num_nodes`` entries of ``args.node_ips`` (already a
    Python list, parsed by the ``str_list`` argparse type). A single-node run
    (``num_nodes <= 1``) is a no-op.

    Args:
        args: parsed command-line namespace with ``num_nodes`` and ``node_ips``.
    """
    if args.num_nodes > 1:
        assert args.num_nodes <= len(args.node_ips), \
            "num_nodes exceeds the number of configured node IPs"
        # BUG FIX: the original iterated the legacy ``args.node_list`` comma
        # string, which defaults to None (crashing on .strip()) and ignored
        # ``node_ips`` — the very option the assert above validates.
        nodes = [{"addr": ip} for ip in args.node_ips[:args.num_nodes]]
        flow.env.machine(nodes)
class Snapshot:
def __init__(self, model_save_dir, model_load_dir):
self._model_save_dir = model_save_dir
......@@ -24,6 +36,7 @@ class Snapshot:
print("Saving model to {}.".format(snapshot_save_path))
self._check_point.save(snapshot_save_path)
class Summary():
def __init__(self, log_dir, config):
self._log_dir = log_dir
......@@ -82,7 +95,7 @@ def make_lr(train_step_name, model_update_conf, primary_lr, secondary_lr=None):
def print_args(args):
    """Pretty-print the run configuration: a banner plus every parsed argument.

    Args:
        args: parsed command-line namespace; every attribute is printed.
    """
    print("=".ljust(66, "="))
    # Diff-artifact fix: the source showed both the old (args.node_num) and
    # new (args.num_nodes) variants of this line; keep the new one only.
    print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
        args.model, args.gpu_num_per_node, args.num_nodes))
    print("=".ljust(66, "="))
    for arg in vars(args):
        print("{} = {}".format(arg, getattr(args, arg)))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册