Commit f087abe1 authored by chenguowei01

train by iters

Parent 62e3a252
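This commit switches PaddleSeg's dygraph trainer from epoch-based to iteration-based training: `num_epochs`, `save_interval_epochs`, and `log_steps` become `iters`, `save_interval_iters`, and `log_iters`, and logging, checkpointing, evaluation, and LR decay are all keyed to one global iteration counter. A minimal standalone sketch of the loop pattern the diff below adopts (the `train_by_iters` helper, `train_step` callback, and toy loader are illustrative stand-ins, not code from this commit):

    # Sketch of an iteration-driven training loop: the loader is re-iterated
    # until the global iteration budget is spent, so `iters` is independent of
    # dataset length (an "epoch" becomes a derived, purely cosmetic number).
    def train_by_iters(loader, train_step, iters, log_iters=10):
        it = 0
        while it < iters:
            for data in loader:  # restarts from the top when the loader is exhausted
                it += 1
                if it > iters:  # stop mid-epoch once the budget is reached
                    break
                loss = train_step(data)
                if it % log_iters == 0:
                    epoch = (it - 1) // max(len(loader), 1) + 1
                    print("epoch={}, iter={}/{}, loss={:.4f}".format(
                        epoch, it, iters, loss))

    # Toy usage: an 8-sample "loader" trained for 20 iterations, logging every 5.
    train_by_iters([object()] * 8, lambda data: 0.1234, iters=20, log_iters=5)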
@@ -56,7 +56,7 @@ def infer(model, test_dataset=None, model_dir=None, save_dir='output'):
                 raise Exception("Unexpected info '{}' in im_info".format(
                     info[0]))
-        im_file = im_path.replace(test_dataset.data_dir, '')
+        im_file = im_path.replace(test_dataset.dataset_root, '')
         if im_file[0] == '/':
             im_file = im_file[1:]
         # save added image
......
@@ -32,21 +32,21 @@ def train(model,
           eval_dataset=None,
           optimizer=None,
           save_dir='output',
-          num_epochs=100,
+          iters=10000,
           batch_size=2,
           pretrained_model=None,
           resume_model=None,
-          save_interval_epochs=1,
-          log_steps=10,
+          save_interval_iters=1000,
+          log_iters=10,
           num_classes=None,
           num_workers=8,
           use_vdl=False):
     ignore_index = model.ignore_index
     nranks = ParallelEnv().nranks
-    start_epoch = 0
+    start_iter = 0
     if resume_model is not None:
-        start_epoch = resume(model, optimizer, resume_model)
+        start_iter = resume(model, optimizer, resume_model)
     elif pretrained_model is not None:
         load_pretrained_model(model, pretrained_model)
@@ -75,16 +75,19 @@ def train(model,
     timer = Timer()
     avg_loss = 0.0
-    steps_per_epoch = len(batch_sampler)
-    total_steps = steps_per_epoch * (num_epochs - start_epoch)
-    num_steps = 0
+    iters_per_epoch = len(batch_sampler)
     best_mean_iou = -1.0
-    best_model_epoch = -1
+    best_model_iter = -1
     train_reader_cost = 0.0
     train_batch_cost = 0.0
-    for epoch in range(start_epoch, num_epochs):
-        timer.start()
-        for step, data in enumerate(loader):
+    timer.start()
+    iter = 0
+    while iter < iters:
+        for data in loader:
+            iter += 1
+            if iter > iters:
+                break
             train_reader_cost += timer.elapsed_time()
             images = data[0]
             labels = data[1].astype('int64')
@@ -101,64 +104,63 @@ def train(model,
             model.clear_gradients()
             avg_loss += loss.numpy()[0]
             lr = optimizer.current_step_lr()
-            num_steps += 1
             train_batch_cost += timer.elapsed_time()
-            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
-                avg_loss /= log_steps
-                avg_train_reader_cost = train_reader_cost / log_steps
-                avg_train_batch_cost = train_batch_cost / log_steps
+            if (iter) % log_iters == 0 and ParallelEnv().local_rank == 0:
+                avg_loss /= log_iters
+                avg_train_reader_cost = train_reader_cost / log_iters
+                avg_train_batch_cost = train_batch_cost / log_iters
                 train_reader_cost = 0.0
                 train_batch_cost = 0.0
-                remain_steps = total_steps - num_steps
-                eta = calculate_eta(remain_steps, avg_train_batch_cost)
+                remain_iters = iters - iter
+                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                 logger.info(
-                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
-                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
+                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
+                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                             avg_loss * nranks, lr, avg_train_batch_cost,
                             avg_train_reader_cost, eta))
                 if use_vdl:
-                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
-                                          num_steps)
-                    log_writer.add_scalar('Train/lr', lr, num_steps)
+                    log_writer.add_scalar('Train/loss', avg_loss * nranks, iter)
+                    log_writer.add_scalar('Train/lr', lr, iter)
                     log_writer.add_scalar('Train/batch_cost',
-                                          avg_train_batch_cost, num_steps)
+                                          avg_train_batch_cost, iter)
                     log_writer.add_scalar('Train/reader_cost',
-                                          avg_train_reader_cost, num_steps)
+                                          avg_train_reader_cost, iter)
                 avg_loss = 0.0
             timer.restart()
-        if ((epoch + 1) % save_interval_epochs == 0
-                or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
-            current_save_dir = os.path.join(save_dir,
-                                            "epoch_{}".format(epoch + 1))
-            if not os.path.isdir(current_save_dir):
-                os.makedirs(current_save_dir)
-            fluid.save_dygraph(model.state_dict(),
-                               os.path.join(current_save_dir, 'model'))
-            fluid.save_dygraph(optimizer.state_dict(),
-                               os.path.join(current_save_dir, 'model'))
+            if (iter % save_interval_iters == 0
+                    or iter == iters) and ParallelEnv().local_rank == 0:
+                current_save_dir = os.path.join(save_dir,
+                                                "iter_{}".format(iter))
+                if not os.path.isdir(current_save_dir):
+                    os.makedirs(current_save_dir)
+                fluid.save_dygraph(model.state_dict(),
+                                   os.path.join(current_save_dir, 'model'))
+                fluid.save_dygraph(optimizer.state_dict(),
+                                   os.path.join(current_save_dir, 'model'))
-            if eval_dataset is not None:
-                mean_iou, avg_acc = evaluate(
-                    model,
-                    eval_dataset,
-                    model_dir=current_save_dir,
-                    num_classes=num_classes,
-                    ignore_index=ignore_index,
-                    epoch_id=epoch + 1)
-                if mean_iou > best_mean_iou:
-                    best_mean_iou = mean_iou
-                    best_model_epoch = epoch + 1
-                    best_model_dir = os.path.join(save_dir, "best_model")
-                    fluid.save_dygraph(model.state_dict(),
-                                       os.path.join(best_model_dir, 'model'))
-                logger.info(
-                    'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}'
-                    .format(best_model_epoch, best_mean_iou))
+                if eval_dataset is not None:
+                    mean_iou, avg_acc = evaluate(
+                        model,
+                        eval_dataset,
+                        model_dir=current_save_dir,
+                        num_classes=num_classes,
+                        ignore_index=ignore_index,
+                        iter_id=iter)
+                    if mean_iou > best_mean_iou:
+                        best_mean_iou = mean_iou
+                        best_model_iter = iter
+                        best_model_dir = os.path.join(save_dir, "best_model")
+                        fluid.save_dygraph(
+                            model.state_dict(),
+                            os.path.join(best_model_dir, 'model'))
+                    logger.info(
+                        'Current evaluated best model in eval_dataset is iter_{}, miou={:.4f}'
+                        .format(best_model_iter, best_mean_iou))
-                if use_vdl:
-                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1)
-                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1)
-                model.train()
+                    if use_vdl:
+                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
+                        log_writer.add_scalar('Evaluate/aAcc', avg_acc, iter)
+                    model.train()
     if use_vdl:
         log_writer.close()
@@ -30,22 +30,22 @@ def evaluate(model,
              model_dir=None,
              num_classes=None,
              ignore_index=255,
-             epoch_id=None):
+             iter_id=None):
     ckpt_path = os.path.join(model_dir, 'model')
     para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path)
     model.set_dict(para_state_dict)
     model.eval()
-    total_steps = len(eval_dataset)
+    total_iters = len(eval_dataset)
     conf_mat = ConfusionMatrix(num_classes, streaming=True)
     logger.info(
-        "Start to evaluating(total_samples={}, total_steps={})...".format(
-            len(eval_dataset), total_steps))
+        "Start evaluating (total_samples={}, total_iters={})...".format(
+            len(eval_dataset), total_iters))
     timer = Timer()
     timer.start()
-    for step, (im, im_info, label) in tqdm.tqdm(
-            enumerate(eval_dataset), total=total_steps):
+    for iter, (im, im_info, label) in tqdm.tqdm(
+            enumerate(eval_dataset), total=total_iters):
         im = to_variable(im)
         pred, _ = model(im)
         pred = pred.numpy().astype('float32')
@@ -67,12 +67,12 @@ def evaluate(model,
         conf_mat.calculate(pred=pred, label=label, ignore=mask)
         _, iou = conf_mat.mean_iou()
-        time_step = timer.elapsed_time()
-        remain_step = total_steps - step - 1
+        time_iter = timer.elapsed_time()
+        remain_iter = total_iters - iter - 1
         logger.debug(
-            "[EVAL] Epoch={}, Step={}/{}, iou={:4f}, sec/step={:.4f} | ETA {}".
-            format(epoch_id, step + 1, total_steps, iou, time_step,
-                   calculate_eta(remain_step, time_step)))
+            "[EVAL] iter_id={}, iter={}/{}, iou={:.4f}, sec/iter={:.4f} | ETA {}"
+            .format(iter_id, iter + 1, total_iters, iou, time_iter,
+                    calculate_eta(remain_iter, time_iter)))
         timer.restart()
     category_iou, miou = conf_mat.mean_iou()
......
@@ -61,11 +61,11 @@ def parse_args():
         default=[512, 512],
         type=int)
     parser.add_argument(
-        '--num_epochs',
-        dest='num_epochs',
-        help='Number epochs for training',
+        '--iters',
+        dest='iters',
+        help='Number of iterations for training',
         type=int,
-        default=100)
+        default=10000)
     parser.add_argument(
         '--batch_size',
         dest='batch_size',
@@ -91,9 +91,9 @@ def parse_args():
         type=str,
         default=None)
     parser.add_argument(
-        '--save_interval_epochs',
-        dest='save_interval_epochs',
-        help='The interval epochs for save a model snapshot',
+        '--save_interval_iters',
+        dest='save_interval_iters',
+        help='The iteration interval at which to save a model snapshot',
         type=int,
         default=5)
     parser.add_argument(
@@ -114,9 +114,9 @@ def parse_args():
         help='Eval while training',
         action='store_true')
     parser.add_argument(
-        '--log_steps',
-        dest='log_steps',
-        help='Display logging information at every log_steps',
+        '--log_iters',
+        dest='log_iters',
+        help='Display logging information every log_iters iterations',
         default=10,
         type=int)
     parser.add_argument(
@@ -174,11 +174,10 @@ def main(args):
     # Create optimizer
     # todo: may be one less than len(loader)
-    num_steps_each_epoch = len(train_dataset) // (
+    num_iters_each_epoch = len(train_dataset) // (
         args.batch_size * ParallelEnv().nranks)
-    decay_step = args.num_epochs * num_steps_each_epoch
     lr_decay = fluid.layers.polynomial_decay(
-        args.learning_rate, decay_step, end_learning_rate=0, power=0.9)
+        args.learning_rate, args.iters, end_learning_rate=0, power=0.9)
     optimizer = fluid.optimizer.Momentum(
         lr_decay,
         momentum=0.9,
@@ -192,12 +191,12 @@ def main(args):
         eval_dataset=eval_dataset,
         optimizer=optimizer,
         save_dir=args.save_dir,
-        num_epochs=args.num_epochs,
+        iters=args.iters,
         batch_size=args.batch_size,
         pretrained_model=args.pretrained_model,
         resume_model=args.resume_model,
-        save_interval_epochs=args.save_interval_epochs,
-        log_steps=args.log_steps,
+        save_interval_iters=args.save_interval_iters,
+        log_iters=args.log_iters,
         num_classes=train_dataset.num_classes,
         num_workers=args.num_workers,
         use_vdl=args.use_vdl)
......
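A side effect worth noting from the last two hunks: the polynomial learning-rate schedule now decays over `args.iters` directly instead of over `num_epochs * num_steps_each_epoch`, so the schedule no longer shifts when the dataset size, batch size, or rank count changes. A quick hand check of the resulting curve (plain Python; this assumes the standard polynomial-decay formula `(lr0 - end_lr) * (1 - step / decay_steps) ** power + end_lr` that `fluid.layers.polynomial_decay` computes, without the `cycle` option):

    # Hand-computed polynomial decay matching the commit's configuration:
    # start at args.learning_rate, decay to 0 over args.iters steps, power 0.9.
    def poly_lr(lr0, step, decay_steps, end_lr=0.0, power=0.9):
        step = min(step, decay_steps)  # clamp after the budget is exhausted
        return (lr0 - end_lr) * (1.0 - float(step) / decay_steps) ** power + end_lr

    for step in (0, 2500, 5000, 7500, 10000):
        print(step, round(poly_lr(0.01, step, decay_steps=10000), 6))
    # prints approximately: 0.01, 0.007719, 0.005359, 0.002872, 0.0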