diff --git a/dygraph/train.py b/dygraph/train.py index 88b1ccb64bcb7f9862a4f81a17ee1cd392db36ae..52aa032c454a5cc72ddf9cc3b27cee5b415511eb 100644 --- a/dygraph/train.py +++ b/dygraph/train.py @@ -26,6 +26,8 @@ import models import utils.logging as logging from utils import get_environ_info from utils import load_pretrained_model +from utils import resume +from utils import Timer, calculate_eta from val import evaluate @@ -78,7 +80,13 @@ def parse_args(): parser.add_argument( '--pretrained_model', dest='pretrained_model', - help='The path of pretrained weight', + help='The path of pretrained model', + type=str, + default=None) + parser.add_argument( + '--resume_model', + dest='resume_model', + help='The path of resume model', type=str, default=None) parser.add_argument( @@ -104,6 +112,17 @@ def parse_args(): dest='do_eval', help='Eval while training', action='store_true') + parser.add_argument( + '--log_steps', + dest='log_steps', + help='Display logging information at every log_steps', + default=10, + type=int) + parser.add_argument( + '--use_vdl', + dest='use_vdl', + help='Whether to record the data to VisualDL during training', + action='store_true') return parser.parse_args() @@ -117,13 +136,20 @@ def train(model, num_epochs=100, batch_size=2, pretrained_model=None, + resume_model=None, save_interval_epochs=1, + log_steps=10, num_classes=None, - num_workers=8): + num_workers=8, + use_vdl=False): ignore_index = model.ignore_index nranks = ParallelEnv().nranks - load_pretrained_model(model, pretrained_model) + start_epoch = 0 + if resume_model is not None: + start_epoch = resume(model, optimizer, resume_model) + elif pretrained_model is not None: + load_pretrained_model(model, pretrained_model) if not os.path.isdir(save_dir): if os.path.exists(save_dir): @@ -144,9 +170,19 @@ def train(model, return_list=True, ) - num_steps_each_epoch = len(train_dataset) // batch_size + if use_vdl: + from visualdl import LogWriter + log_writer = LogWriter(save_dir) - for epoch in 
range(num_epochs): + timer = Timer() + timer.start() + avg_loss = 0.0 + steps_per_epoch = len(batch_sampler) + total_steps = steps_per_epoch * (num_epochs - start_epoch) + num_steps = 0 + best_mean_iou = -1.0 + best_model_epoch = 1 + for epoch in range(start_epoch, num_epochs): for step, data in enumerate(loader): images = data[0] labels = data[1].astype('int64') @@ -160,22 +196,37 @@ def train(model, loss.backward() optimizer.minimize(loss) model.clear_gradients() - logging.info("[TRAIN] Epoch={}/{}, Step={}/{}, loss={}".format( - epoch + 1, num_epochs, step + 1, len(batch_sampler), - loss.numpy())) + avg_loss += loss.numpy()[0] + lr = optimizer.current_step_lr() + num_steps += 1 + if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0: + avg_loss /= log_steps + time_step = timer.elapsed_time() / log_steps + remain_steps = total_steps - num_steps + logging.info( + "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, sec/step={:.4f} | ETA {}" + .format(epoch + 1, num_epochs, step + 1, steps_per_epoch, + avg_loss, lr, time_step, + calculate_eta(remain_steps, time_step))) + if use_vdl: + log_writer.add_scalar('Train/loss', avg_loss, num_steps) + log_writer.add_scalar('Train/lr', lr, num_steps) + avg_loss = 0.0 + timer.restart() if ((epoch + 1) % save_interval_epochs == 0 - or num_steps_each_epoch == num_epochs - 1 - ) and ParallelEnv().local_rank == 0: + or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0: current_save_dir = os.path.join(save_dir, "epoch_{}".format(epoch + 1)) if not os.path.isdir(current_save_dir): os.makedirs(current_save_dir) fluid.save_dygraph(model.state_dict(), os.path.join(current_save_dir, 'model')) + fluid.save_dygraph(optimizer.state_dict(), + os.path.join(current_save_dir, 'model')) if eval_dataset is not None: - evaluate( + mean_iou, mean_acc = evaluate( model, eval_dataset, places=places, @@ -184,7 +235,24 @@ def train(model, batch_size=batch_size, ignore_index=ignore_index, epoch_id=epoch + 1) + if mean_iou > 
best_mean_iou: + best_mean_iou = mean_iou + best_model_epoch = epoch + 1 + best_model_dir = os.path.join(save_dir, "best_model") + fluid.save_dygraph(model.state_dict(), + os.path.join(best_model_dir, 'model')) + logging.info( + 'Current evaluated best model in eval_dataset is epoch_{}, miou={:.4f}' + .format(best_model_epoch, best_mean_iou)) + + if use_vdl: + log_writer.add_scalar('Evaluate/mean_iou', mean_iou, + epoch + 1) + log_writer.add_scalar('Evaluate/mean_acc', mean_acc, + epoch + 1) model.train() + if use_vdl: + log_writer.close() def main(args): @@ -223,7 +291,9 @@ def main(args): num_classes=train_dataset.num_classes, ignore_index=255) # Creat optimizer - num_steps_each_epoch = len(train_dataset) // args.batch_size + # todo, may less one than len(loader) + num_steps_each_epoch = len(train_dataset) // ( + args.batch_size * ParallelEnv().nranks) decay_step = args.num_epochs * num_steps_each_epoch lr_decay = fluid.layers.polynomial_decay( args.learning_rate, decay_step, end_learning_rate=0, power=0.9) @@ -243,9 +313,12 @@ num_epochs=args.num_epochs, batch_size=args.batch_size, pretrained_model=args.pretrained_model, + resume_model=args.resume_model, save_interval_epochs=args.save_interval_epochs, + log_steps=args.log_steps, num_classes=train_dataset.num_classes, - num_workers=args.num_workers) + num_workers=args.num_workers, + use_vdl=args.use_vdl) if __name__ == '__main__': diff --git a/dygraph/utils/__init__.py b/dygraph/utils/__init__.py index 7579cf7f0ed9f051b154d7bc2f99fc25ac246d4a..68a8136a647f50dac8ab122530c71c82cca53f79 100644 --- a/dygraph/utils/__init__.py +++ b/dygraph/utils/__init__.py @@ -16,3 +16,4 @@ from . import logging from . 
import download from .metrics import ConfusionMatrix from .utils import * +from .timer import Timer, calculate_eta diff --git a/dygraph/utils/timer.py b/dygraph/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..4ebbddc9a154de4a36d6b6d9b437e14382031c49 --- /dev/null +++ b/dygraph/utils/timer.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + + +class Timer(object): + """ Simple timer class for measuring time consuming """ + + def __init__(self): + self._start_time = 0.0 + self._end_time = 0.0 + self._elapsed_time = 0.0 + self._is_running = False + + def start(self): + self._is_running = True + self._start_time = time.time() + + def restart(self): + self.start() + + def stop(self): + self._is_running = False + self._end_time = time.time() + + def elapsed_time(self): + self._end_time = time.time() + self._elapsed_time = self._end_time - self._start_time + if not self.is_running: + return 0.0 + + return self._elapsed_time + + @property + def is_running(self): + return self._is_running + + +def calculate_eta(remaining_step, speed): + if remaining_step < 0: + remaining_step = 0 + remaining_time = int(remaining_step * speed) + result = "{:0>2}:{:0>2}:{:0>2}" + arr = [] + for i in range(2, -1, -1): + arr.append(int(remaining_time / 60**i)) + remaining_time %= 60**i + return result.format(*arr) diff --git a/dygraph/utils/utils.py 
b/dygraph/utils/utils.py index 7a450b352e0dcf98c1eeaa093878c9b3ba649dfd..46e204dd2e91f319c788eb43ca50602308ce1954 100644 --- a/dygraph/utils/utils.py +++ b/dygraph/utils/utils.py @@ -49,7 +49,7 @@ def get_environ_info(): def load_pretrained_model(model, pretrained_model): if pretrained_model is not None: - logging.info('Load pretrained model!') + logging.info('Load pretrained model from {}'.format(pretrained_model)) if os.path.exists(pretrained_model): ckpt_path = os.path.join(pretrained_model, 'model') para_state_dict, _ = fluid.load_dygraph(ckpt_path) @@ -74,8 +74,30 @@ def load_pretrained_model(model, pretrained_model): else: raise ValueError( - 'The pretrained model directory is not Found: {}'.formnat( + 'The pretrained model directory is not Found: {}'.format( pretrained_model)) + else: + logging.info('No pretrained model to load, train from scratch') + + +def resume(model, optimizer, resume_model): + if resume_model is not None: + logging.info('Resume model from {}'.format(resume_model)) + if os.path.exists(resume_model): + ckpt_path = os.path.join(resume_model, 'model') + para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path) + model.set_dict(para_state_dict) + optimizer.set_dict(opti_state_dict) + epoch = resume_model.split('_')[-1] + # fall back to epoch 0 when the dir name has no numeric suffix + epoch = int(epoch) if epoch.isdigit() else 0 + return epoch + else: + raise ValueError( + 'The resume model directory is not Found: {}'.format( + resume_model)) + else: + logging.info('No model need to resume') def visualize(image, result, save_dir=None, weight=0.6): diff --git a/dygraph/val.py b/dygraph/val.py index 358bcd83b3e32cc0f86b334ec5b09748e593ee1e..77965f3f8a040d1bfa2f1c6cfaa3a838ddc937c7 100644 --- a/dygraph/val.py +++ b/dygraph/val.py @@ -29,6 +29,7 @@ import models import utils.logging as logging from utils import get_environ_info from utils import ConfusionMatrix +from utils import Timer, calculate_eta def parse_args(): @@ -96,12 +97,14 @@ def evaluate(model, places=places, return_list=True, ) - total_steps 
= math.ceil(len(eval_dataset) * 1.0 / batch_size) + total_steps = len(batch_sampler) conf_mat = ConfusionMatrix(num_classes, streaming=True) logging.info( "Start to evaluating(total_samples={}, total_steps={})...".format( len(eval_dataset), total_steps)) + timer = Timer() + timer.start() for step, data in enumerate(loader): images = data[0] labels = data[1].astype('int64') @@ -113,8 +116,13 @@ def evaluate(model, conf_mat.calculate(pred=pred, label=labels, ignore=mask) _, iou = conf_mat.mean_iou() - logging.info("[EVAL] Epoch={}, Step={}/{}, iou={}".format( - epoch_id, step + 1, total_steps, iou)) + time_step = timer.elapsed_time() + remain_step = total_steps - step - 1 + logging.info( + "[EVAL] Epoch={}, Step={}/{}, iou={:.4f}, sec/step={:.4f} | ETA {}". + format(epoch_id, step + 1, total_steps, iou, time_step, + calculate_eta(remain_step, time_step))) + timer.restart() category_iou, miou = conf_mat.mean_iou() category_acc, macc = conf_mat.accuracy() @@ -123,6 +131,7 @@ def evaluate(model, logging.info("[EVAL] Category IoU: " + str(category_iou)) logging.info("[EVAL] Category Acc: " + str(category_acc)) logging.info("[EVAL] Kappa:{:.4f} ".format(conf_mat.kappa())) + return miou, macc def main(args):