From 11428859579aafc5e45d7aabdb9a00dfb55d1064 Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Tue, 12 May 2020 21:31:21 +0800 Subject: [PATCH] mv resume_checkpoint into net_initialize --- paddlex/cv/models/base.py | 55 ++++++++------- paddlex/cv/models/classifier.py | 26 +++---- paddlex/cv/models/deeplabv3p.py | 26 +++---- paddlex/cv/models/faster_rcnn.py | 25 +++---- paddlex/cv/models/mask_rcnn.py | 25 +++---- paddlex/cv/models/yolo_v3.py | 26 +++---- paddlex/utils/utils.py | 117 ++++++++++++++++++++++++++++++- 7 files changed, 191 insertions(+), 109 deletions(-) diff --git a/paddlex/cv/models/base.py b/paddlex/cv/models/base.py index beb51d7..c1e2fc8 100644 --- a/paddlex/cv/models/base.py +++ b/paddlex/cv/models/base.py @@ -70,6 +70,8 @@ class BaseAPI: self.sync_bn = False # 当前模型状态 self.status = 'Normal' + # 已完成迭代轮数,为恢复训练时的起始轮数 + self.completed_epochs = 0 def _get_single_card_bs(self, batch_size): if batch_size % len(self.places) == 0: @@ -182,22 +184,36 @@ class BaseAPI: fuse_bn=False, save_dir='.', sensitivities_file=None, - eval_metric_loss=0.05): - pretrain_dir = osp.join(save_dir, 'pretrain') - if not os.path.isdir(pretrain_dir): - if os.path.exists(pretrain_dir): - os.remove(pretrain_dir) - os.makedirs(pretrain_dir) - if hasattr(self, 'backbone'): - backbone = self.backbone - else: - backbone = self.__class__.__name__ - pretrain_weights = get_pretrain_weights( - pretrain_weights, self.model_type, backbone, pretrain_dir) + eval_metric_loss=0.05, + resume_checkpoint=None): + if not resume_checkpoint: + pretrain_dir = osp.join(save_dir, 'pretrain') + if not os.path.isdir(pretrain_dir): + if os.path.exists(pretrain_dir): + os.remove(pretrain_dir) + os.makedirs(pretrain_dir) + if hasattr(self, 'backbone'): + backbone = self.backbone + else: + backbone = self.__class__.__name__ + pretrain_weights = get_pretrain_weights( + pretrain_weights, self.model_type, backbone, pretrain_dir) if startup_prog is None: startup_prog = 
fluid.default_startup_program() self.exe.run(startup_prog) - if pretrain_weights is not None: + if resume_checkpoint: + logging.info( + "Resume checkpoint from {}.".format(resume_checkpoint), + use_color=True) + paddlex.utils.utils.load_pretrain_weights( + self.exe, self.train_prog, resume_checkpoint, resume=True) + if not osp.exists(osp.join(resume_checkpoint, "model.yml")): + raise Exception( + "There's no model.yml in {}".format(resume_checkpoint)) + with open(osp.join(resume_checkpoint, "model.yml")) as f: + info = yaml.load(f.read(), Loader=yaml.Loader) + self.completed_epochs = info['completed_epochs'] + elif pretrain_weights is not None: + logging.info( + "Load pretrain weights from {}.".format(pretrain_weights), + use_color=True) @@ -226,17 +242,6 @@ class BaseAPI: use_color=True) self.status = 'Prune' - def resume_checkpoint(self, path, startup_prog=None): - if not osp.isdir(path): - raise Exception("Model pretrain path {} does not " - "exists.".format(path)) - if osp.exists(osp.join(path, 'model.pdparams')): - path = osp.join(path, 'model') - if startup_prog is None: - startup_prog = fluid.default_startup_program() - self.exe.run(startup_prog) - fluid.load(self.train_prog, path, executor=self.exe) - def get_model_info(self): info = dict() info['version'] = paddlex.__version__ @@ -272,6 +277,7 @@ class BaseAPI: name = op.__class__.__name__ attr = op.__dict__ info['Transforms'].append({name: attr}) + info['completed_epochs'] = self.completed_epochs return info def save_model(self, save_dir): @@ -513,6 +519,7 @@ class BaseAPI: return_details=True) logging.info('[EVAL] Finished, Epoch={}, {} .'.format( i + 1, dict2str(self.eval_metrics))) + self.completed_epochs += 1 # 保存最优模型 best_accuracy_key = list(self.eval_metrics.keys())[0] current_accuracy = self.eval_metrics[best_accuracy_key] diff --git a/paddlex/cv/models/classifier.py b/paddlex/cv/models/classifier.py index aaf439b..c1a0f69 100644 --- a/paddlex/cv/models/classifier.py +++
b/paddlex/cv/models/classifier.py @@ -157,24 +157,16 @@ class BaseClassifier(BaseAPI): # 构建训练、验证、预测网络 self.build_program() # 初始化网络权重 + self.net_initialize( + startup_prog=fluid.default_startup_program(), + pretrain_weights=pretrain_weights, + save_dir=save_dir, + sensitivities_file=sensitivities_file, + eval_metric_loss=eval_metric_loss, + resume_checkpoint=resume_checkpoint) + start_epoch = 0 if resume_checkpoint: - self.resume_checkpoint( - path=resume_checkpoint, - startup_prog=fluid.default_startup_program()) - scope = fluid.global_scope() - v = scope.find_var('@LR_DECAY_COUNTER@') - step = np.array(v.get_tensor())[0] if v else 0 - num_steps_each_epoch = train_dataset.num_samples // train_batch_size - start_epoch = step // num_steps_each_epoch + 1 - else: - self.net_initialize( - startup_prog=fluid.default_startup_program(), - pretrain_weights=pretrain_weights, - save_dir=save_dir, - sensitivities_file=sensitivities_file, - eval_metric_loss=eval_metric_loss) - start_epoch = 0 - + start_epoch = self.completed_epochs # 训练 self.train_loop( start_epoch=start_epoch, diff --git a/paddlex/cv/models/deeplabv3p.py b/paddlex/cv/models/deeplabv3p.py index f1ccf10..74f877b 100644 --- a/paddlex/cv/models/deeplabv3p.py +++ b/paddlex/cv/models/deeplabv3p.py @@ -281,24 +281,16 @@ class DeepLabv3p(BaseAPI): # 构建训练、验证、预测网络 self.build_program() # 初始化网络权重 + self.net_initialize( + startup_prog=fluid.default_startup_program(), + pretrain_weights=pretrain_weights, + save_dir=save_dir, + sensitivities_file=sensitivities_file, + eval_metric_loss=eval_metric_loss, + resume_checkpoint=resume_checkpoint) + start_epoch = 0 if resume_checkpoint: - self.resume_checkpoint( - path=resume_checkpoint, - startup_prog=fluid.default_startup_program()) - scope = fluid.global_scope() - v = scope.find_var('@LR_DECAY_COUNTER@') - step = np.array(v.get_tensor())[0] if v else 0 - num_steps_each_epoch = train_dataset.num_samples // train_batch_size - start_epoch = step // num_steps_each_epoch + 1 - else: 
- self.net_initialize( - startup_prog=fluid.default_startup_program(), - pretrain_weights=pretrain_weights, - save_dir=save_dir, - sensitivities_file=sensitivities_file, - eval_metric_loss=eval_metric_loss) - start_epoch = 0 - + start_epoch = self.completed_epochs # 训练 self.train_loop( start_epoch=start_epoch, diff --git a/paddlex/cv/models/faster_rcnn.py b/paddlex/cv/models/faster_rcnn.py index 1d97d84..45a2927 100644 --- a/paddlex/cv/models/faster_rcnn.py +++ b/paddlex/cv/models/faster_rcnn.py @@ -229,23 +229,16 @@ class FasterRCNN(BaseAPI): fuse_bn = True if self.with_fpn and self.backbone in ['ResNet18', 'ResNet50']: fuse_bn = False + self.net_initialize( + startup_prog=fluid.default_startup_program(), + pretrain_weights=pretrain_weights, + save_dir=save_dir, + sensitivities_file=sensitivities_file, + eval_metric_loss=eval_metric_loss, + resume_checkpoint=resume_checkpoint) + start_epoch = 0 if resume_checkpoint: - self.resume_checkpoint( - path=resume_checkpoint, - startup_prog=fluid.default_startup_program()) - scope = fluid.global_scope() - v = scope.find_var('@LR_DECAY_COUNTER@') - step = np.array(v.get_tensor())[0] if v else 0 - num_steps_each_epoch = train_dataset.num_samples // train_batch_size - start_epoch = step // num_steps_each_epoch + 1 - else: - self.net_initialize( - startup_prog=fluid.default_startup_program(), - pretrain_weights=pretrain_weights, - fuse_bn=fuse_bn, - save_dir=save_dir) - start_epoch = 0 - + start_epoch = self.completed_epochs # 训练 self.train_loop( start_epoch=start_epoch, diff --git a/paddlex/cv/models/mask_rcnn.py b/paddlex/cv/models/mask_rcnn.py index fd75df4..e1130ee 100644 --- a/paddlex/cv/models/mask_rcnn.py +++ b/paddlex/cv/models/mask_rcnn.py @@ -196,23 +196,16 @@ class MaskRCNN(FasterRCNN): fuse_bn = True if self.with_fpn and self.backbone in ['ResNet18', 'ResNet50']: fuse_bn = False + self.net_initialize( + startup_prog=fluid.default_startup_program(), + pretrain_weights=pretrain_weights, + save_dir=save_dir, + 
sensitivities_file=sensitivities_file, + eval_metric_loss=eval_metric_loss, + resume_checkpoint=resume_checkpoint) + start_epoch = 0 if resume_checkpoint: - self.resume_checkpoint( - path=resume_checkpoint, - startup_prog=fluid.default_startup_program()) - scope = fluid.global_scope() - v = scope.find_var('@LR_DECAY_COUNTER@') - step = np.array(v.get_tensor())[0] if v else 0 - num_steps_each_epoch = train_dataset.num_samples // train_batch_size - start_epoch = step // num_steps_each_epoch + 1 - else: - self.net_initialize( - startup_prog=fluid.default_startup_program(), - pretrain_weights=pretrain_weights, - fuse_bn=fuse_bn, - save_dir=save_dir) - start_epoch = 0 - + start_epoch = self.completed_epochs # 训练 self.train_loop( start_epoch=start_epoch, diff --git a/paddlex/cv/models/yolo_v3.py b/paddlex/cv/models/yolo_v3.py index 3ea1317..3b27d00 100644 --- a/paddlex/cv/models/yolo_v3.py +++ b/paddlex/cv/models/yolo_v3.py @@ -233,24 +233,16 @@ class YOLOv3(BaseAPI): # 构建训练、验证、预测网络 self.build_program() # 初始化网络权重 + self.net_initialize( + startup_prog=fluid.default_startup_program(), + pretrain_weights=pretrain_weights, + save_dir=save_dir, + sensitivities_file=sensitivities_file, + eval_metric_loss=eval_metric_loss, + resume_checkpoint=resume_checkpoint) + start_epoch = 0 if resume_checkpoint: - self.resume_checkpoint( - path=resume_checkpoint, - startup_prog=fluid.default_startup_program()) - scope = fluid.global_scope() - v = scope.find_var('@LR_DECAY_COUNTER@') - step = np.array(v.get_tensor())[0] if v else 0 - num_steps_each_epoch = train_dataset.num_samples // train_batch_size - start_epoch = step // num_steps_each_epoch + 1 - else: - self.net_initialize( - startup_prog=fluid.default_startup_program(), - pretrain_weights=pretrain_weights, - save_dir=save_dir, - sensitivities_file=sensitivities_file, - eval_metric_loss=eval_metric_loss) - start_epoch = 0 - + start_epoch = self.completed_epochs # 训练 self.train_loop( start_epoch=start_epoch, diff --git 
a/paddlex/utils/utils.py b/paddlex/utils/utils.py index 69e4565..cbdd6a5 100644 --- a/paddlex/utils/utils.py +++ b/paddlex/utils/utils.py @@ -170,11 +170,85 @@ def load_pdparams(exe, main_prog, model_dir): len(vars_to_load), model_dir)) -def load_pretrain_weights(exe, main_prog, weights_dir, fuse_bn=False): +def is_persistable(var): + import paddle.fluid as fluid + from paddle.fluid.proto.framework_pb2 import VarType + + if var.desc.type() == fluid.core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == fluid.core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == fluid.core.VarDesc.VarType.READER: + return False + return var.persistable + + +def is_belong_to_optimizer(var): + import paddle.fluid as fluid + from paddle.fluid.proto.framework_pb2 import VarType + + if not (isinstance(var, fluid.framework.Parameter) + or var.desc.need_check_feed()): + return is_persistable(var) + return False + + +def load_pdopt(exe, main_prog, model_dir): + import paddle.fluid as fluid + + optimizer_var_list = list() + vars_to_load = list() + import pickle + with open(osp.join(model_dir, 'model.pdopt'), 'rb') as f: + opt_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + optimizer_var_list = list( + filter(is_belong_to_optimizer, main_prog.list_vars())) + exception_message = "the training process can not be resumed due to optimizer set now and last time is different. 
Recommend to use `pretrain_weights` instead of `resume_checkpoint`" + if len(optimizer_var_list) > 0: + for var in optimizer_var_list: + if var.name not in opt_dict: + raise Exception( + "{} is not in saved paddlex optimizer, {}".format( + var.name, exception_message)) + if var.shape != opt_dict[var.name].shape: + raise Exception( + "Shape of optimizer variable {} doesn't match.(Last: {}, Now: {}), {}" + .format(var.name, opt_dict[var.name].shape, + var.shape, exception_message)) + optimizer_varname_list = [var.name for var in optimizer_var_list] + for k, v in opt_dict.items(): + if k not in optimizer_varname_list: + raise Exception( + "{} in saved paddlex optimizer is not in the model, {}". + format(k, exception_message)) + fluid.io.set_program_state(main_prog, opt_dict) + + if len(optimizer_var_list) == 0: + raise Exception( + "There is no optimizer parameters in the model, please set the optimizer!" + ) + else: + logging.info( + "There are {} optimizer parameters in {} are loaded.".format( + len(optimizer_var_list), model_dir)) + + +def load_pretrain_weights(exe, + main_prog, + weights_dir, + fuse_bn=False, + resume=False): if not osp.exists(weights_dir): raise Exception("Path {} not exists.".format(weights_dir)) if osp.exists(osp.join(weights_dir, "model.pdparams")): - return load_pdparams(exe, main_prog, weights_dir) + load_pdparams(exe, main_prog, weights_dir) + if resume: + if osp.exists(osp.join(weights_dir, "model.pdopt")): + load_pdopt(exe, main_prog, weights_dir) + else: + raise Exception( + "Optimizer file {} does not exist. Stop resuming training. 
Recommend to use `pretrain_weights` instead of `resume_checkpoint`" + .format(osp.join(weights_dir, "model.pdopt"))) + return import paddle.fluid as fluid vars_to_load = list() for var in main_prog.list_vars(): @@ -209,6 +283,45 @@ def load_pretrain_weights(exe, main_prog, weights_dir, fuse_bn=False): len(vars_to_load), weights_dir)) if fuse_bn: fuse_bn_weights(exe, main_prog, weights_dir) + if resume: + exception_message = "the training process can not be resumed due to optimizer set now and last time is different. Recommend to use `pretrain_weights` instead of `resume_checkpoint`" + optimizer_var_list = list( + filter(is_belong_to_optimizer, main_prog.list_vars())) + if len(optimizer_var_list) > 0: + for var in optimizer_var_list: + if not osp.exists(osp.join(weights_dir, var.name)): + raise Exception( + "Optimizer parameter {} doesn't exist, {}".format( + osp.join(weights_dir, var.name), + exception_message)) + pretrained_shape = parse_param_file( + osp.join(weights_dir, var.name)) + actual_shape = tuple(var.shape) + if pretrained_shape != actual_shape: + raise Exception( + "Shape of optimizer variable {} doesn't match.(Last: {}, Now: {}), {}" + .format(var.name, pretrained_shape, + actual_shape, exception_message)) + optimizer_varname_list = [var.name for var in optimizer_var_list] + if osp.exists(osp.join(weights_dir, 'learning_rate') + ) and 'learning_rate' not in optimizer_varname_list: + raise Exception( + "Optimizer parameter {}/learning_rate is not in the model, {}" + .format(weights_dir, exception_message)) + fluid.io.load_vars( + executor=exe, + dirname=weights_dir, + main_program=main_prog, + vars=optimizer_var_list) + + if len(optimizer_var_list) == 0: + raise Exception( + "There is no optimizer parameters in the model, please set the optimizer!" + ) + else: + logging.info( + "There are {} optimizer parameters in {} are loaded.".format( + len(optimizer_var_list), weights_dir)) class EarlyStop: -- GitLab