From f087abe199178b41f0a1a906334faf273e922557 Mon Sep 17 00:00:00 2001
From: chenguowei01
Date: Fri, 21 Aug 2020 17:58:03 +0800
Subject: [PATCH] train by iters

---
 dygraph/core/infer.py |   2 +-
 dygraph/core/train.py | 116 +++++++++++++++++++++---------------------
 dygraph/core/val.py   |  22 ++++----
 dygraph/train.py      |  31 ++++++-----
 4 files changed, 86 insertions(+), 85 deletions(-)

diff --git a/dygraph/core/infer.py b/dygraph/core/infer.py
index f86823bc..499890d2 100644
--- a/dygraph/core/infer.py
+++ b/dygraph/core/infer.py
@@ -56,7 +56,7 @@ def infer(model, test_dataset=None, model_dir=None, save_dir='output'):
                 raise Exception("Unexpected info '{}' in im_info".format(
                     info[0]))
 
-        im_file = im_path.replace(test_dataset.data_dir, '')
+        im_file = im_path.replace(test_dataset.dataset_root, '')
         if im_file[0] == '/':
             im_file = im_file[1:]
         # save added image
diff --git a/dygraph/core/train.py b/dygraph/core/train.py
index 94a3ee4e..9f3f83c4 100644
--- a/dygraph/core/train.py
+++ b/dygraph/core/train.py
@@ -32,21 +32,21 @@ def train(model,
           eval_dataset=None,
           optimizer=None,
           save_dir='output',
-          num_epochs=100,
+          iters=10000,
           batch_size=2,
           pretrained_model=None,
           resume_model=None,
-          save_interval_epochs=1,
-          log_steps=10,
+          save_interval_iters=1000,
+          log_iters=10,
           num_classes=None,
           num_workers=8,
           use_vdl=False):
     ignore_index = model.ignore_index
     nranks = ParallelEnv().nranks
 
-    start_epoch = 0
+    start_iter = 0
     if resume_model is not None:
-        start_epoch = resume(model, optimizer, resume_model)
+        start_iter = resume(model, optimizer, resume_model)
     elif pretrained_model is not None:
         load_pretrained_model(model, pretrained_model)
 
@@ -75,16 +75,19 @@ def train(model,
 
     timer = Timer()
     avg_loss = 0.0
-    steps_per_epoch = len(batch_sampler)
-    total_steps = steps_per_epoch * (num_epochs - start_epoch)
-    num_steps = 0
+    iters_per_epoch = len(batch_sampler)
     best_mean_iou = -1.0
-    best_model_epoch = -1
+    best_model_iter = -1
     train_reader_cost = 0.0
     train_batch_cost = 0.0
-    for epoch in range(start_epoch, num_epochs):
-        timer.start()
-        for step, data in enumerate(loader):
+    timer.start()
+
+    iter = 0
+    while iter < iters:
+        for data in loader:
+            iter += 1
+            if iter > iters:
+                break
             train_reader_cost += timer.elapsed_time()
             images = data[0]
             labels = data[1].astype('int64')
@@ -101,64 +104,63 @@ def train(model,
                 model.clear_gradients()
             avg_loss += loss.numpy()[0]
             lr = optimizer.current_step_lr()
-            num_steps += 1
             train_batch_cost += timer.elapsed_time()
-            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
-                avg_loss /= log_steps
-                avg_train_reader_cost = train_reader_cost / log_steps
-                avg_train_batch_cost = train_batch_cost / log_steps
+            if (iter) % log_iters == 0 and ParallelEnv().local_rank == 0:
+                avg_loss /= log_iters
+                avg_train_reader_cost = train_reader_cost / log_iters
+                avg_train_batch_cost = train_batch_cost / log_iters
                 train_reader_cost = 0.0
                 train_batch_cost = 0.0
-                remain_steps = total_steps - num_steps
-                eta = calculate_eta(remain_steps, avg_train_batch_cost)
+                remain_iters = iters - iter
+                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                 logger.info(
-                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
-                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
+                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
+                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                             avg_loss * nranks, lr, avg_train_batch_cost,
                             avg_train_reader_cost, eta))
                 if use_vdl:
-                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
-                                          num_steps)
-                    log_writer.add_scalar('Train/lr', lr, num_steps)
+                    log_writer.add_scalar('Train/loss', avg_loss * nranks, iter)
+                    log_writer.add_scalar('Train/lr', lr, iter)
                     log_writer.add_scalar('Train/batch_cost',
-                                          avg_train_batch_cost, num_steps)
+                                          avg_train_batch_cost, iter)
                     log_writer.add_scalar('Train/reader_cost',
-                                          avg_train_reader_cost, num_steps)
+                                          avg_train_reader_cost, iter)
                 avg_loss = 0.0
             timer.restart()
 
-        if ((epoch + 1) % save_interval_epochs == 0
-                or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
-            current_save_dir = os.path.join(save_dir,
-                                            "epoch_{}".format(epoch + 1))
-            if not os.path.isdir(current_save_dir):
-                os.makedirs(current_save_dir)
-            fluid.save_dygraph(model.state_dict(),
-                               os.path.join(current_save_dir, 'model'))
-            fluid.save_dygraph(optimizer.state_dict(),
-                               os.path.join(current_save_dir, 'model'))
+            if (iter % save_interval_iters == 0
+                    or iter == iters) and ParallelEnv().local_rank == 0:
+                current_save_dir = os.path.join(save_dir,
+                                                "iter_{}".format(iter))
+                if not os.path.isdir(current_save_dir):
+                    os.makedirs(current_save_dir)
+                fluid.save_dygraph(model.state_dict(),
+                                   os.path.join(current_save_dir, 'model'))
+                fluid.save_dygraph(optimizer.state_dict(),
+                                   os.path.join(current_save_dir, 'model'))
 
-            if eval_dataset is not None:
-                mean_iou, avg_acc = evaluate(
-                    model,
-                    eval_dataset,
-                    model_dir=current_save_dir,
-                    num_classes=num_classes,
-                    ignore_index=ignore_index,
-                    epoch_id=epoch + 1)
-                if mean_iou > best_mean_iou:
-                    best_mean_iou = mean_iou
-                    best_model_epoch = epoch + 1
-                    best_model_dir = os.path.join(save_dir, "best_model")
-                    fluid.save_dygraph(model.state_dict(),
-                                       os.path.join(best_model_dir, 'model'))
-                logger.info(
-                    'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}'
-                    .format(best_model_epoch, best_mean_iou))
+                if eval_dataset is not None:
+                    mean_iou, avg_acc = evaluate(
+                        model,
+                        eval_dataset,
+                        model_dir=current_save_dir,
+                        num_classes=num_classes,
+                        ignore_index=ignore_index,
+                        iter_id=iter)
+                    if mean_iou > best_mean_iou:
+                        best_mean_iou = mean_iou
+                        best_model_iter = iter
+                        best_model_dir = os.path.join(save_dir, "best_model")
+                        fluid.save_dygraph(
+                            model.state_dict(),
+                            os.path.join(best_model_dir, 'model'))
+                    logger.info(
+                        'Current evaluated best model in eval_dataset is iter_{}, miou={:4f}'
+                        .format(best_model_iter, best_mean_iou))
 
-                if use_vdl:
-                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1)
-                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1)
-            model.train()
+                    if use_vdl:
+                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
+                        log_writer.add_scalar('Evaluate/aAcc', avg_acc, iter)
+                    model.train()
     if use_vdl:
         log_writer.close()
diff --git a/dygraph/core/val.py b/dygraph/core/val.py
index a35f0709..e5e8dd4b 100644
--- a/dygraph/core/val.py
+++ b/dygraph/core/val.py
@@ -30,22 +30,22 @@ def evaluate(model,
              model_dir=None,
              num_classes=None,
              ignore_index=255,
-             epoch_id=None):
+             iter_id=None):
     ckpt_path = os.path.join(model_dir, 'model')
     para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path)
     model.set_dict(para_state_dict)
     model.eval()
 
-    total_steps = len(eval_dataset)
+    total_iters = len(eval_dataset)
     conf_mat = ConfusionMatrix(num_classes, streaming=True)
     logger.info(
-        "Start to evaluating(total_samples={}, total_steps={})...".format(
-            len(eval_dataset), total_steps))
+        "Start to evaluating(total_samples={}, total_iters={})...".format(
+            len(eval_dataset), total_iters))
     timer = Timer()
     timer.start()
-    for step, (im, im_info, label) in tqdm.tqdm(
-            enumerate(eval_dataset), total=total_steps):
+    for iter, (im, im_info, label) in tqdm.tqdm(
+            enumerate(eval_dataset), total=total_iters):
         im = to_variable(im)
         pred, _ = model(im)
         pred = pred.numpy().astype('float32')
@@ -67,12 +67,12 @@ def evaluate(model,
         conf_mat.calculate(pred=pred, label=label, ignore=mask)
         _, iou = conf_mat.mean_iou()
 
-        time_step = timer.elapsed_time()
-        remain_step = total_steps - step - 1
+        time_iter = timer.elapsed_time()
+        remain_iter = total_iters - iter - 1
         logger.debug(
-            "[EVAL] Epoch={}, Step={}/{}, iou={:4f}, sec/step={:.4f} | ETA {}".
-            format(epoch_id, step + 1, total_steps, iou, time_step,
-                   calculate_eta(remain_step, time_step)))
+            "[EVAL] iter_id={}, iter={}/{}, iou={:4f}, sec/iter={:.4f} | ETA {}"
+            .format(iter_id, iter + 1, total_iters, iou, time_iter,
+                    calculate_eta(remain_iter, time_iter)))
         timer.restart()
 
     category_iou, miou = conf_mat.mean_iou()
diff --git a/dygraph/train.py b/dygraph/train.py
index da93d0ec..bea7e60b 100644
--- a/dygraph/train.py
+++ b/dygraph/train.py
@@ -61,11 +61,11 @@ def parse_args():
         default=[512, 512],
         type=int)
     parser.add_argument(
-        '--num_epochs',
-        dest='num_epochs',
-        help='Number epochs for training',
+        '--iters',
+        dest='iters',
+        help='iters for training',
         type=int,
-        default=100)
+        default=10000)
     parser.add_argument(
         '--batch_size',
         dest='batch_size',
@@ -91,9 +91,9 @@ def parse_args():
         type=str,
         default=None)
     parser.add_argument(
-        '--save_interval_epochs',
-        dest='save_interval_epochs',
-        help='The interval epochs for save a model snapshot',
+        '--save_interval_iters',
+        dest='save_interval_iters',
+        help='The interval iters for save a model snapshot',
         type=int,
         default=5)
     parser.add_argument(
@@ -114,9 +114,9 @@ def parse_args():
         help='Eval while training',
         action='store_true')
     parser.add_argument(
-        '--log_steps',
-        dest='log_steps',
-        help='Display logging information at every log_steps',
+        '--log_iters',
+        dest='log_iters',
+        help='Display logging information at every log_iters',
         default=10,
         type=int)
     parser.add_argument(
@@ -174,11 +174,10 @@ def main(args):
 
     # Creat optimizer
     # todo, may less one than len(loader)
-    num_steps_each_epoch = len(train_dataset) // (
+    num_iters_each_epoch = len(train_dataset) // (
         args.batch_size * ParallelEnv().nranks)
-    decay_step = args.num_epochs * num_steps_each_epoch
     lr_decay = fluid.layers.polynomial_decay(
-        args.learning_rate, decay_step, end_learning_rate=0, power=0.9)
+        args.learning_rate, args.iters, end_learning_rate=0, power=0.9)
     optimizer = fluid.optimizer.Momentum(
         lr_decay,
         momentum=0.9,
@@ -192,12 +191,12 @@ def main(args):
         eval_dataset=eval_dataset,
         optimizer=optimizer,
         save_dir=args.save_dir,
-        num_epochs=args.num_epochs,
+        iters=args.iters,
        batch_size=args.batch_size,
         pretrained_model=args.pretrained_model,
         resume_model=args.resume_model,
-        save_interval_epochs=args.save_interval_epochs,
-        log_steps=args.log_steps,
+        save_interval_iters=args.save_interval_iters,
+        log_iters=args.log_iters,
         num_classes=train_dataset.num_classes,
         num_workers=args.num_workers,
         use_vdl=args.use_vdl)
-- 
GitLab
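
Note on the core loop change: the patch replaces the outer epoch loop with a
global iteration counter. The inner for-loop re-iterates the DataLoader (one
full pass = one epoch) and breaks once the target count is exceeded, so
training can now stop mid-epoch. Below is a minimal runnable sketch of just
that control flow, with a toy list standing in for the Paddle DataLoader and
prints standing in for the train step, logging and checkpointing (the `run`
function and its toy values are illustrative, not part of the patch):

    def run(iters=10, log_iters=3, save_interval_iters=5):
        loader = [0, 1, 2, 3]  # toy stand-in for the real DataLoader
        iters_per_epoch = len(loader)
        it = 0
        while it < iters:
            for data in loader:  # each full pass over the loader is one epoch
                it += 1
                if it > iters:  # stop exactly at `iters`, possibly mid-epoch
                    break
                # forward/backward/optimizer step would happen here
                if it % log_iters == 0:
                    epoch = (it - 1) // iters_per_epoch + 1
                    print("epoch={}, iter={}/{}".format(epoch, it, iters))
                if it % save_interval_iters == 0 or it == iters:
                    print("save checkpoint to iter_{}".format(it))

    run()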
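
Note on the learning-rate schedule: the decay horizon used to be derived as
num_epochs * num_steps_each_epoch; it is now args.iters directly. Assuming
Paddle's fluid.layers.polynomial_decay follows its documented form (worth
verifying against the installed version), the decayed rate at a given step is

    lr = (learning_rate - end_learning_rate) * (1 - step / decay_steps)**power + end_learning_rate

so with end_learning_rate=0 and power=0.9 the schedule now anneals to zero
exactly at iteration args.iters rather than at the end of the final epoch.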
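
Note on the renamed CLI flags: a run is now configured in iterations rather
than epochs. An example invocation using only the flags visible in this diff
(values are illustrative; model and dataset flags are elided):

    python dygraph/train.py \
        --iters 10000 \
        --batch_size 2 \
        --save_interval_iters 1000 \
        --log_iters 10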