Commit 11428859 authored by FlyingQianMM

mv resume_checkpoint into net_initialize

Parent 76c1af21
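For context, here is a minimal usage sketch of how resuming looks after this change. It is an illustration, not part of the commit: the dataset paths and the checkpoint directory `output/resnet50/epoch_10` are hypothetical, and the API shown is PaddleX 1.x style.

import paddlex as pdx
from paddlex.cls import transforms

# Hypothetical dataset; substitute your own paths.
train_transforms = transforms.Compose(
    [transforms.RandomCrop(crop_size=224), transforms.Normalize()])
train_dataset = pdx.datasets.ImageNet(
    data_dir='my_dataset',
    file_list='my_dataset/train_list.txt',
    label_list='my_dataset/labels.txt',
    transforms=train_transforms)

model = pdx.cls.ResNet50(num_classes=len(train_dataset.labels))
# With this commit, passing resume_checkpoint is enough: net_initialize
# restores the weights, the optimizer state, and the completed epoch count.
model.train(
    num_epochs=20,
    train_dataset=train_dataset,
    save_dir='output/resnet50',
    resume_checkpoint='output/resnet50/epoch_10')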
......@@ -70,6 +70,8 @@ class BaseAPI:
self.sync_bn = False
# Current model status
self.status = 'Normal'
# Number of completed epochs; serves as the starting epoch when resuming training
self.completed_epochs = 0
def _get_single_card_bs(self, batch_size):
if batch_size % len(self.places) == 0:
......@@ -182,22 +184,36 @@ class BaseAPI:
fuse_bn=False,
save_dir='.',
sensitivities_file=None,
eval_metric_loss=0.05):
pretrain_dir = osp.join(save_dir, 'pretrain')
if not os.path.isdir(pretrain_dir):
if os.path.exists(pretrain_dir):
os.remove(pretrain_dir)
os.makedirs(pretrain_dir)
if hasattr(self, 'backbone'):
backbone = self.backbone
else:
backbone = self.__class__.__name__
pretrain_weights = get_pretrain_weights(
pretrain_weights, self.model_type, backbone, pretrain_dir)
eval_metric_loss=0.05,
resume_checkpoint=None):
if not resume_checkpoint:
pretrain_dir = osp.join(save_dir, 'pretrain')
if not os.path.isdir(pretrain_dir):
if os.path.exists(pretrain_dir):
os.remove(pretrain_dir)
os.makedirs(pretrain_dir)
if hasattr(self, 'backbone'):
backbone = self.backbone
else:
backbone = self.__class__.__name__
pretrain_weights = get_pretrain_weights(
pretrain_weights, self.model_type, backbone, pretrain_dir)
if startup_prog is None:
startup_prog = fluid.default_startup_program()
self.exe.run(startup_prog)
if pretrain_weights is not None:
if resume_checkpoint:
logging.info(
"Resume checkpoint from {}.".format(resume_checkpoint),
use_color=True)
paddlex.utils.utils.load_pretrain_weights(
self.exe, self.train_prog, resume_checkpoint, resume=True)
if not osp.exists(osp.join(resume_checkpoint, "model.yml")):
raise Exception(
"There's not model.yml in {}".format(resume_checkpoint))
with open(osp.join(resume_checkpoint, "model.yml")) as f:
info = yaml.load(f.read(), Loader=yaml.Loader)
self.completed_epochs = info['completed_epochs']
elif pretrain_weights is not None:
logging.info(
"Load pretrain weights from {}.".format(pretrain_weights),
use_color=True)
......@@ -226,17 +242,6 @@ class BaseAPI:
use_color=True)
self.status = 'Prune'
def resume_checkpoint(self, path, startup_prog=None):
if not osp.isdir(path):
raise Exception("Model pretrain path {} does not "
"exists.".format(path))
if osp.exists(osp.join(path, 'model.pdparams')):
path = osp.join(path, 'model')
if startup_prog is None:
startup_prog = fluid.default_startup_program()
self.exe.run(startup_prog)
fluid.load(self.train_prog, path, executor=self.exe)
def get_model_info(self):
info = dict()
info['version'] = paddlex.__version__
......@@ -272,6 +277,7 @@ class BaseAPI:
name = op.__class__.__name__
attr = op.__dict__
info['Transforms'].append({name: attr})
info['completed_epochs'] = self.completed_epochs
return info
def save_model(self, save_dir):
......@@ -513,6 +519,7 @@ class BaseAPI:
return_details=True)
logging.info('[EVAL] Finished, Epoch={}, {}.'.format(
i + 1, dict2str(self.eval_metrics)))
self.completed_epochs += 1
# Save the best model
best_accuracy_key = list(self.eval_metrics.keys())[0]
current_accuracy = self.eval_metrics[best_accuracy_key]
......
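The resume logic hinges on completed_epochs being serialized into model.yml by get_model_info() and read back in net_initialize(). A minimal sketch of that round trip, assuming a checkpoint directory written by save_model():

import os.path as osp
import yaml

def read_completed_epochs(checkpoint_dir):
    # save_model() dumps get_model_info() to model.yml, which now carries
    # the completed_epochs counter incremented each epoch by train_loop().
    yml_path = osp.join(checkpoint_dir, 'model.yml')
    if not osp.exists(yml_path):
        raise Exception("There is no model.yml in {}".format(checkpoint_dir))
    with open(yml_path) as f:
        info = yaml.load(f.read(), Loader=yaml.Loader)
    return info['completed_epochs']

# Training then resumes from this epoch, e.g.:
# start_epoch = read_completed_epochs('output/resnet50/epoch_10')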
......@@ -157,24 +157,16 @@ class BaseClassifier(BaseAPI):
# Build the training, validation, and prediction networks
self.build_program()
# Initialize the network weights
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss,
resume_checkpoint=resume_checkpoint)
start_epoch = 0
if resume_checkpoint:
self.resume_checkpoint(
path=resume_checkpoint,
startup_prog=fluid.default_startup_program())
scope = fluid.global_scope()
v = scope.find_var('@LR_DECAY_COUNTER@')
step = np.array(v.get_tensor())[0] if v else 0
num_steps_each_epoch = train_dataset.num_samples // train_batch_size
start_epoch = step // num_steps_each_epoch + 1
else:
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss)
start_epoch = 0
start_epoch = self.completed_epochs
# Training
self.train_loop(
start_epoch=start_epoch,
......
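The same edit is repeated in every trainer below (BaseClassifier, DeepLabv3p, FasterRCNN, MaskRCNN, YOLOv3). Condensed, with variable names as in the hunks, the pattern is:

# Before: the starting epoch was reconstructed from the LR decay counter.
v = fluid.global_scope().find_var('@LR_DECAY_COUNTER@')
step = np.array(v.get_tensor())[0] if v else 0
num_steps_each_epoch = train_dataset.num_samples // train_batch_size
start_epoch = step // num_steps_each_epoch + 1

# After: net_initialize() restores it from model.yml via completed_epochs.
start_epoch = self.completed_epochs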
......@@ -281,24 +281,16 @@ class DeepLabv3p(BaseAPI):
# Build the training, validation, and prediction networks
self.build_program()
# Initialize the network weights
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss,
resume_checkpoint=resume_checkpoint)
start_epoch = 0
if resume_checkpoint:
self.resume_checkpoint(
path=resume_checkpoint,
startup_prog=fluid.default_startup_program())
scope = fluid.global_scope()
v = scope.find_var('@LR_DECAY_COUNTER@')
step = np.array(v.get_tensor())[0] if v else 0
num_steps_each_epoch = train_dataset.num_samples // train_batch_size
start_epoch = step // num_steps_each_epoch + 1
else:
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss)
start_epoch = 0
start_epoch = self.completed_epochs
# Training
self.train_loop(
start_epoch=start_epoch,
......
......@@ -229,23 +229,16 @@ class FasterRCNN(BaseAPI):
fuse_bn = True
if self.with_fpn and self.backbone in ['ResNet18', 'ResNet50']:
fuse_bn = False
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss,
resume_checkpoint=resume_checkpoint)
start_epoch = 0
if resume_checkpoint:
self.resume_checkpoint(
path=resume_checkpoint,
startup_prog=fluid.default_startup_program())
scope = fluid.global_scope()
v = scope.find_var('@LR_DECAY_COUNTER@')
step = np.array(v.get_tensor())[0] if v else 0
num_steps_each_epoch = train_dataset.num_samples // train_batch_size
start_epoch = step // num_steps_each_epoch + 1
else:
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
fuse_bn=fuse_bn,
save_dir=save_dir)
start_epoch = 0
start_epoch = self.completed_epochs
# Training
self.train_loop(
start_epoch=start_epoch,
......
......@@ -196,23 +196,16 @@ class MaskRCNN(FasterRCNN):
fuse_bn = True
if self.with_fpn and self.backbone in ['ResNet18', 'ResNet50']:
fuse_bn = False
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss,
resume_checkpoint=resume_checkpoint)
start_epoch = 0
if resume_checkpoint:
self.resume_checkpoint(
path=resume_checkpoint,
startup_prog=fluid.default_startup_program())
scope = fluid.global_scope()
v = scope.find_var('@LR_DECAY_COUNTER@')
step = np.array(v.get_tensor())[0] if v else 0
num_steps_each_epoch = train_dataset.num_samples // train_batch_size
start_epoch = step // num_steps_each_epoch + 1
else:
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
fuse_bn=fuse_bn,
save_dir=save_dir)
start_epoch = 0
start_epoch = self.completed_epochs
# Training
self.train_loop(
start_epoch=start_epoch,
......
......@@ -233,24 +233,16 @@ class YOLOv3(BaseAPI):
# Build the training, validation, and prediction networks
self.build_program()
# Initialize the network weights
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss,
resume_checkpoint=resume_checkpoint)
start_epoch = 0
if resume_checkpoint:
self.resume_checkpoint(
path=resume_checkpoint,
startup_prog=fluid.default_startup_program())
scope = fluid.global_scope()
v = scope.find_var('@LR_DECAY_COUNTER@')
step = np.array(v.get_tensor())[0] if v else 0
num_steps_each_epoch = train_dataset.num_samples // train_batch_size
start_epoch = step // num_steps_each_epoch + 1
else:
self.net_initialize(
startup_prog=fluid.default_startup_program(),
pretrain_weights=pretrain_weights,
save_dir=save_dir,
sensitivities_file=sensitivities_file,
eval_metric_loss=eval_metric_loss)
start_epoch = 0
start_epoch = self.completed_epochs
# Training
self.train_loop(
start_epoch=start_epoch,
......
......@@ -170,11 +170,85 @@ def load_pdparams(exe, main_prog, model_dir):
len(vars_to_load), model_dir))
def load_pretrain_weights(exe, main_prog, weights_dir, fuse_bn=False):
def is_persistable(var):
import paddle.fluid as fluid
from paddle.fluid.proto.framework_pb2 import VarType
if var.desc.type() == fluid.core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == fluid.core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == fluid.core.VarDesc.VarType.READER:
return False
return var.persistable
def is_belong_to_optimizer(var):
import paddle.fluid as fluid
from paddle.fluid.proto.framework_pb2 import VarType
if not (isinstance(var, fluid.framework.Parameter)
or var.desc.need_check_feed()):
return is_persistable(var)
return False
def load_pdopt(exe, main_prog, model_dir):
import paddle.fluid as fluid
optimizer_var_list = list()
vars_to_load = list()
import pickle
with open(osp.join(model_dir, 'model.pdopt'), 'rb') as f:
opt_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
optimizer_var_list = list(
filter(is_belong_to_optimizer, main_prog.list_vars()))
exception_message = "the training process can not be resumed due to optimizer set now and last time is different. Recommend to use `pretrain_weights` instead of `resume_checkpoint`"
if len(optimizer_var_list) > 0:
for var in optimizer_var_list:
if var.name not in opt_dict:
raise Exception(
"{} is not in saved paddlex optimizer, {}".format(
var.name, exception_message))
if var.shape != opt_dict[var.name].shape:
raise Exception(
"Shape of optimizer variable {} doesn't match.(Last: {}, Now: {}), {}"
.format(var.name, opt_dict[var.name].shape,
var.shape), exception_message)
optimizer_varname_list = [var.name for var in optimizer_var_list]
for k, v in opt_dict.items():
if k not in optimizer_varname_list:
raise Exception(
"{} in saved paddlex optimizer is not in the model, {}".
format(k, exception_message))
fluid.io.set_program_state(main_prog, opt_dict)
if len(optimizer_var_list) == 0:
raise Exception(
"There is no optimizer parameters in the model, please set the optimizer!"
)
else:
logging.info(
"There are {} optimizer parameters in {} are loaded.".format(
len(optimizer_var_list), model_dir))
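As the loader above shows, model.pdopt is a plain pickle of {variable name: numpy array} pairs. A small inspection helper can be handy when the shape-mismatch exceptions above fire; the checkpoint path below is hypothetical:

import pickle
import six

def inspect_pdopt(pdopt_path):
    # Mirrors the loading convention used in load_pdopt() above.
    with open(pdopt_path, 'rb') as f:
        opt_dict = pickle.load(f) if six.PY2 else pickle.load(
            f, encoding='latin1')
    for name, value in opt_dict.items():
        print(name, getattr(value, 'shape', None))

inspect_pdopt('output/resnet50/epoch_10/model.pdopt')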
def load_pretrain_weights(exe,
main_prog,
weights_dir,
fuse_bn=False,
resume=False):
if not osp.exists(weights_dir):
raise Exception("Path {} not exists.".format(weights_dir))
if osp.exists(osp.join(weights_dir, "model.pdparams")):
return load_pdparams(exe, main_prog, weights_dir)
load_pdparams(exe, main_prog, weights_dir)
if resume:
if osp.exists(osp.join(weights_dir, "model.pdopt")):
load_pdopt(exe, main_prog, weights_dir)
else:
raise Exception(
"Optimizer file {} does not exist. Stop resumming training. Recommend to use `pretrain_weights` instead of `resume_checkpoint`"
.format(osp.join(weights_dir, "model.pdopt")))
return
import paddle.fluid as fluid
vars_to_load = list()
for var in main_prog.list_vars():
......@@ -209,6 +283,45 @@ def load_pretrain_weights(exe, main_prog, weights_dir, fuse_bn=False):
len(vars_to_load), weights_dir))
if fuse_bn:
fuse_bn_weights(exe, main_prog, weights_dir)
if resume:
exception_message = "the training process can not be resumed due to optimizer set now and last time is different. Recommend to use `pretrain_weights` instead of `resume_checkpoint`"
optimizer_var_list = list(
filter(is_belong_to_optimizer, main_prog.list_vars()))
if len(optimizer_var_list) > 0:
for var in optimizer_var_list:
if not osp.exists(osp.join(weights_dir, var.name)):
raise Exception(
"Optimizer parameter {} doesn't exist, {}".format(
osp.join(weights_dir, var.name),
exception_message))
pretrained_shape = parse_param_file(
osp.join(weights_dir, var.name))
actual_shape = tuple(var.shape)
if pretrained_shape != actual_shape:
raise Exception(
"Shape of optimizer variable {} doesn't match.(Last: {}, Now: {}), {}"
.format(var.name, opt_dict[var.name].shape,
var.shape), exception_message)
optimizer_varname_list = [var.name for var in optimizer_var_list]
if osp.exists(osp.join(weights_dir, 'learning_rate')
) and 'learning_rate' not in optimizer_varname_list:
raise Exception(
"Optimizer parameter {}/learning_rate is not in the model, {}"
.format(weights_dir, exception_message))
fluid.io.load_vars(
executor=exe,
dirname=weights_dir,
main_program=main_prog,
vars=optimizer_var_list)
if len(optimizer_var_list) == 0:
raise Exception(
"There is no optimizer parameters in the model, please set the optimizer!"
)
else:
logging.info(
"There are {} optimizer parameters in {} are loaded.".format(
len(optimizer_var_list), weights_dir))
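Putting the pieces together, a minimal driver sketch of how net_initialize() invokes this helper when resuming; the executor setup is illustrative and train_prog stands in for the trainer's built program:

import paddle.fluid as fluid
import paddlex

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# resume=True loads model.pdparams and then the optimizer state
# validated by the checks above.
paddlex.utils.utils.load_pretrain_weights(
    exe, train_prog, 'output/resnet50/epoch_10', resume=True)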
class EarlyStop:
......