Commit f087abe1 authored by chenguowei01

train by iters

Parent 62e3a252
@@ -56,7 +56,7 @@ def infer(model, test_dataset=None, model_dir=None, save_dir='output'):
             raise Exception("Unexpected info '{}' in im_info".format(
                 info[0]))
-        im_file = im_path.replace(test_dataset.data_dir, '')
+        im_file = im_path.replace(test_dataset.dataset_root, '')
         if im_file[0] == '/':
             im_file = im_file[1:]
         # save added image
...
@@ -32,21 +32,21 @@ def train(model,
           eval_dataset=None,
           optimizer=None,
           save_dir='output',
-          num_epochs=100,
+          iters=10000,
           batch_size=2,
           pretrained_model=None,
           resume_model=None,
-          save_interval_epochs=1,
-          log_steps=10,
+          save_interval_iters=1000,
+          log_iters=10,
           num_classes=None,
           num_workers=8,
           use_vdl=False):
     ignore_index = model.ignore_index
     nranks = ParallelEnv().nranks
-    start_epoch = 0
+    start_iter = 0
     if resume_model is not None:
-        start_epoch = resume(model, optimizer, resume_model)
+        start_iter = resume(model, optimizer, resume_model)
     elif pretrained_model is not None:
         load_pretrained_model(model, pretrained_model)
@@ -75,16 +75,19 @@ def train(model,
     timer = Timer()
     avg_loss = 0.0
-    steps_per_epoch = len(batch_sampler)
-    total_steps = steps_per_epoch * (num_epochs - start_epoch)
-    num_steps = 0
+    iters_per_epoch = len(batch_sampler)
     best_mean_iou = -1.0
-    best_model_epoch = -1
+    best_model_iter = -1
     train_reader_cost = 0.0
     train_batch_cost = 0.0
-    for epoch in range(start_epoch, num_epochs):
-        timer.start()
-        for step, data in enumerate(loader):
+    timer.start()
+    iter = 0
+    while iter < iters:
+        for data in loader:
+            iter += 1
+            if iter > iters:
+                break
             train_reader_cost += timer.elapsed_time()
             images = data[0]
             labels = data[1].astype('int64')
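The core of the change: the epoch loop becomes an iteration-budget loop. Two details are worth noting. First, the counter is bumped before the bound check, so the guard must be `if iter > iters: break` rather than `>=` for the count to come out exact. Second, `iter` starts at 0 rather than at the `start_iter` returned by resume, so a resumed run still executes the full `iters` budget. A minimal self-contained sketch of the loop shape, with a plain list standing in for the real DataLoader:

    # Sketch only: the loop shape mirrors the diff above.
    iters = 10
    loader = [0, 1, 2, 3]           # one "epoch" is four batches here
    iter = 0                        # mirrors the diff; shadows the builtin
    consumed = []
    while iter < iters:
        for data in loader:         # the loader restarts at each epoch boundary
            iter += 1
            if iter > iters:        # '>' (not '>='), since iter was already bumped
                break
            consumed.append(data)   # the train step would run here
    assert len(consumed) == iters   # exactly `iters` batches are processed

The outer while restarts the loader whenever an epoch is exhausted, and the inner guard breaks mid-epoch once the budget is spent, so `iters` no longer needs to be a multiple of the epoch length.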
@@ -101,36 +104,34 @@ def train(model,
             model.clear_gradients()
             avg_loss += loss.numpy()[0]
             lr = optimizer.current_step_lr()
-            num_steps += 1
             train_batch_cost += timer.elapsed_time()
-            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
-                avg_loss /= log_steps
-                avg_train_reader_cost = train_reader_cost / log_steps
-                avg_train_batch_cost = train_batch_cost / log_steps
+            if (iter) % log_iters == 0 and ParallelEnv().local_rank == 0:
+                avg_loss /= log_iters
+                avg_train_reader_cost = train_reader_cost / log_iters
+                avg_train_batch_cost = train_batch_cost / log_iters
                 train_reader_cost = 0.0
                 train_batch_cost = 0.0
-                remain_steps = total_steps - num_steps
-                eta = calculate_eta(remain_steps, avg_train_batch_cost)
+                remain_iters = iters - iter
+                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                 logger.info(
-                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
-                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
+                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
+                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                             avg_loss * nranks, lr, avg_train_batch_cost,
                             avg_train_reader_cost, eta))
                 if use_vdl:
-                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
-                                          num_steps)
-                    log_writer.add_scalar('Train/lr', lr, num_steps)
+                    log_writer.add_scalar('Train/loss', avg_loss * nranks, iter)
+                    log_writer.add_scalar('Train/lr', lr, iter)
                     log_writer.add_scalar('Train/batch_cost',
-                                          avg_train_batch_cost, num_steps)
+                                          avg_train_batch_cost, iter)
                     log_writer.add_scalar('Train/reader_cost',
-                                          avg_train_reader_cost, num_steps)
+                                          avg_train_reader_cost, iter)
                 avg_loss = 0.0
                 timer.restart()
-        if ((epoch + 1) % save_interval_epochs == 0
-                or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
-            current_save_dir = os.path.join(save_dir,
-                                            "epoch_{}".format(epoch + 1))
+            if (iter % save_interval_iters == 0
+                    or iter == iters) and ParallelEnv().local_rank == 0:
+                current_save_dir = os.path.join(save_dir,
+                                                "iter_{}".format(iter))
                 if not os.path.isdir(current_save_dir):
                     os.makedirs(current_save_dir)
                 fluid.save_dygraph(model.state_dict(),
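The log line derives its ETA from `calculate_eta(remain_iters, avg_train_batch_cost)`. The helper itself is outside this diff; a plausible sketch, assuming it simply multiplies remaining iterations by seconds per iteration and renders the result as hh:mm:ss (the repository's actual implementation may differ):

    def calculate_eta(remain_iters, speed):
        # Assumed behavior: remaining iterations times seconds per
        # iteration, formatted as hh:mm:ss.
        remain_time = int(remain_iters * speed)
        hours, rest = divmod(remain_time, 3600)
        minutes, seconds = divmod(rest, 60)
        return "{:0>2}:{:0>2}:{:0>2}".format(hours, minutes, seconds)

    print(calculate_eta(9000, 0.35))  # 9000 iters at 0.35 s/iter -> "00:52:30"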
@@ -145,20 +146,21 @@ def train(model,
                     model_dir=current_save_dir,
                     num_classes=num_classes,
                     ignore_index=ignore_index,
-                    epoch_id=epoch + 1)
+                    iter_id=iter)
                 if mean_iou > best_mean_iou:
                     best_mean_iou = mean_iou
-                    best_model_epoch = epoch + 1
+                    best_model_iter = iter
                     best_model_dir = os.path.join(save_dir, "best_model")
-                    fluid.save_dygraph(model.state_dict(),
-                                       os.path.join(best_model_dir, 'model'))
+                    fluid.save_dygraph(
+                        model.state_dict(),
+                        os.path.join(best_model_dir, 'model'))
                 logger.info(
-                    'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}'
-                    .format(best_model_epoch, best_mean_iou))
+                    'Current evaluated best model in eval_dataset is iter_{}, miou={:4f}'
+                    .format(best_model_iter, best_mean_iou))
                 if use_vdl:
-                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1)
-                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1)
+                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
+                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, iter)
                 model.train()
     if use_vdl:
         log_writer.close()
@@ -30,22 +30,22 @@ def evaluate(model,
              model_dir=None,
              num_classes=None,
              ignore_index=255,
-             epoch_id=None):
+             iter_id=None):
     ckpt_path = os.path.join(model_dir, 'model')
     para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path)
     model.set_dict(para_state_dict)
     model.eval()
-    total_steps = len(eval_dataset)
+    total_iters = len(eval_dataset)
     conf_mat = ConfusionMatrix(num_classes, streaming=True)
     logger.info(
-        "Start to evaluating(total_samples={}, total_steps={})...".format(
-            len(eval_dataset), total_steps))
+        "Start to evaluating(total_samples={}, total_iters={})...".format(
+            len(eval_dataset), total_iters))
     timer = Timer()
     timer.start()
-    for step, (im, im_info, label) in tqdm.tqdm(
-            enumerate(eval_dataset), total=total_steps):
+    for iter, (im, im_info, label) in tqdm.tqdm(
+            enumerate(eval_dataset), total=total_iters):
         im = to_variable(im)
         pred, _ = model(im)
         pred = pred.numpy().astype('float32')
@@ -67,12 +67,12 @@ def evaluate(model,
         conf_mat.calculate(pred=pred, label=label, ignore=mask)
         _, iou = conf_mat.mean_iou()
-        time_step = timer.elapsed_time()
-        remain_step = total_steps - step - 1
+        time_iter = timer.elapsed_time()
+        remain_iter = total_iters - iter - 1
         logger.debug(
-            "[EVAL] Epoch={}, Step={}/{}, iou={:4f}, sec/step={:.4f} | ETA {}".
-            format(epoch_id, step + 1, total_steps, iou, time_step,
-                   calculate_eta(remain_step, time_step)))
+            "[EVAL] iter_id={}, iter={}/{}, iou={:4f}, sec/iter={:.4f} | ETA {}"
+            .format(iter_id, iter + 1, total_iters, iou, time_iter,
+                    calculate_eta(remain_iter, time_iter)))
         timer.restart()
     category_iou, miou = conf_mat.mean_iou()
...
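`evaluate` streams every prediction into a `ConfusionMatrix` and repeatedly reads `mean_iou()` from it. The arithmetic behind that call is standard: for each class c, IoU_c = TP_c / (TP_c + FP_c + FN_c), averaged over classes. A minimal NumPy sketch of that computation (a hypothetical stand-in, not the repo's `ConfusionMatrix` class):

    import numpy as np

    def mean_iou(conf):
        # conf[i, j] counts pixels whose true class is i and predicted class is j.
        tp = np.diag(conf)
        fp = conf.sum(axis=0) - tp   # predicted as c but actually another class
        fn = conf.sum(axis=1) - tp   # actually c but predicted as another class
        denom = tp + fp + fn
        iou = tp / np.maximum(denom, 1)  # avoid 0/0 for classes absent everywhere
        return iou, iou.mean()

    conf = np.array([[50., 5.], [10., 35.]])
    per_class, miou = mean_iou(conf)   # IoU = [50/65, 35/50], miou ~= 0.7346

The streaming flag only changes bookkeeping: counts are accumulated batch by batch, so the same formula can be evaluated mid-pass for the progress log and once more at the end for the final score.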
@@ -61,11 +61,11 @@ def parse_args():
         default=[512, 512],
         type=int)
     parser.add_argument(
-        '--num_epochs',
-        dest='num_epochs',
-        help='Number epochs for training',
+        '--iters',
+        dest='iters',
+        help='iters for training',
         type=int,
-        default=100)
+        default=10000)
     parser.add_argument(
         '--batch_size',
         dest='batch_size',
@@ -91,9 +91,9 @@ def parse_args():
         type=str,
         default=None)
     parser.add_argument(
-        '--save_interval_epochs',
-        dest='save_interval_epochs',
-        help='The interval epochs for save a model snapshot',
+        '--save_interval_iters',
+        dest='save_interval_iters',
+        help='The interval iters for save a model snapshot',
         type=int,
         default=5)
     parser.add_argument(
@@ -114,9 +114,9 @@ def parse_args():
         help='Eval while training',
         action='store_true')
     parser.add_argument(
-        '--log_steps',
-        dest='log_steps',
-        help='Display logging information at every log_steps',
+        '--log_iters',
+        dest='log_iters',
+        help='Display logging information at every log_iters',
         default=10,
         type=int)
     parser.add_argument(
@@ -174,11 +174,10 @@ def main(args):
     # Creat optimizer
     # todo, may less one than len(loader)
-    num_steps_each_epoch = len(train_dataset) // (
+    num_iters_each_epoch = len(train_dataset) // (
         args.batch_size * ParallelEnv().nranks)
-    decay_step = args.num_epochs * num_steps_each_epoch
     lr_decay = fluid.layers.polynomial_decay(
-        args.learning_rate, decay_step, end_learning_rate=0, power=0.9)
+        args.learning_rate, args.iters, end_learning_rate=0, power=0.9)
     optimizer = fluid.optimizer.Momentum(
         lr_decay,
         momentum=0.9,
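Dropping `decay_step = args.num_epochs * num_steps_each_epoch` in favor of `args.iters` means the polynomial schedule now decays over exactly the training budget, reaching `end_learning_rate` at the final iteration. `fluid.layers.polynomial_decay` is documented to compute, without the cycle option, lr = (base_lr - end_lr) * (1 - step / decay_steps)^power + end_lr; a pure-Python sketch of that curve:

    def poly_lr(base_lr, step, decay_steps, end_lr=0.0, power=0.9):
        # Polynomial decay without the `cycle` option: base_lr -> end_lr.
        step = min(step, decay_steps)
        return (base_lr - end_lr) * (1 - step / decay_steps) ** power + end_lr

    print(poly_lr(0.01, 0, 10000))      # 0.01 at the first iteration
    print(poly_lr(0.01, 5000, 10000))   # ~0.00536 halfway through
    print(poly_lr(0.01, 10000, 10000))  # 0.0 at the final iteration

With power=0.9 the curve is slightly flatter than linear, so halfway through training the rate is still a bit above half the base rate.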
@@ -192,12 +191,12 @@ def main(args):
         eval_dataset=eval_dataset,
         optimizer=optimizer,
         save_dir=args.save_dir,
-        num_epochs=args.num_epochs,
+        iters=args.iters,
         batch_size=args.batch_size,
         pretrained_model=args.pretrained_model,
         resume_model=args.resume_model,
-        save_interval_epochs=args.save_interval_epochs,
-        log_steps=args.log_steps,
+        save_interval_iters=args.save_interval_iters,
+        log_iters=args.log_iters,
         num_classes=train_dataset.num_classes,
         num_workers=args.num_workers,
         use_vdl=args.use_vdl)
...