From 020d1072189d21d982c0c9dd830a930b759401aa Mon Sep 17 00:00:00 2001 From: wuyefeilin <30919197+wuyefeilin@users.noreply.github.com> Date: Fri, 22 May 2020 12:06:22 +0800 Subject: [PATCH] Save load update (#257) * update model save load * first add * update model save and load * update train.py * update LaneNet model saving and loading * adapt slim to paddle-1.8 * update distillation save and load * update nas model save and load * update model load op * update utils.py * update load_model_utils.py * update model saving and loading --- contrib/HumanSeg/utils/utils.py | 8 +- contrib/LaneNet/eval.py | 5 +- contrib/LaneNet/train.py | 102 ++-------------- contrib/LaneNet/utils/load_model_utils.py | 126 ++++++++++++++++++++ contrib/LaneNet/vis.py | 32 +++-- contrib/RemoteSensing/utils/utils.py | 8 +- pdseg/eval.py | 9 +- pdseg/export_model.py | 14 ++- pdseg/train.py | 128 ++++---------------- pdseg/utils/load_model_utils.py | 128 ++++++++++++++++++++ pdseg/vis.py | 28 +++-- slim/distillation/train_distill.py | 136 +++++----------------- slim/nas/eval_nas.py | 5 +- slim/nas/train_nas.py | 102 +++------------- slim/prune/train_prune.py | 62 ++-------- slim/quantization/train_quant.py | 89 ++++++++------ 16 files changed, 453 insertions(+), 529 deletions(-) create mode 100644 contrib/LaneNet/utils/load_model_utils.py create mode 100644 pdseg/utils/load_model_utils.py diff --git a/contrib/HumanSeg/utils/utils.py b/contrib/HumanSeg/utils/utils.py index 0e09e8b2..f640c74b 100644 --- a/contrib/HumanSeg/utils/utils.py +++ b/contrib/HumanSeg/utils/utils.py @@ -205,11 +205,9 @@ def load_pretrained_weights(exe, main_prog, weights_dir, fuse_bn=False): vars_to_load.append(var) logging.debug("Weight {} will be load".format(var.name)) - fluid.io.load_vars( - executor=exe, - dirname=weights_dir, - main_program=main_prog, - vars=vars_to_load) + params_dict = fluid.io.load_program_state( + weights_dir, var_list=vars_to_load) + fluid.io.set_program_state(main_prog, params_dict) if len(vars_to_load) == 0: logging.warning( "There is no pretrain weights loaded, maybe you should check you pretrain model!" diff --git a/contrib/LaneNet/eval.py b/contrib/LaneNet/eval.py index 025fb4e7..08f5cfe3 100644 --- a/contrib/LaneNet/eval.py +++ b/contrib/LaneNet/eval.py @@ -122,7 +122,10 @@ def evaluate(cfg, ckpt_dir=None, use_gpu=False, use_mpio=False, **kwargs): if ckpt_dir is not None: print('load test model:', ckpt_dir) - fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) + try: + fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) + except: + fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) # Use streaming confusion matrix to calculate mean_iou np.set_printoptions( diff --git a/contrib/LaneNet/train.py b/contrib/LaneNet/train.py index d9d22ba9..d9e42184 100644 --- a/contrib/LaneNet/train.py +++ b/contrib/LaneNet/train.py @@ -40,10 +40,10 @@ from pdseg.utils.timer import Timer, calculate_eta from reader import LaneNetDataset from models.model_builder import build_model from models.model_builder import ModelPhase -from models.model_builder import parse_shape_from_file from eval import evaluate from vis import visualize from utils import dist_utils +from utils.load_model_utils import load_pretrained_weights def parse_args(): @@ -101,37 +101,6 @@ def parse_args(): return parser.parse_args() -def save_vars(executor, dirname, program=None, vars=None): - """ - Temporary resolution for Win save variables compatability. - Will fix in PaddlePaddle v1.5.2 - """ - - save_program = fluid.Program() - save_block = save_program.global_block() - - for each_var in vars: - # NOTE: don't save the variable which type is RAW - if each_var.type == fluid.core.VarDesc.VarType.RAW: - continue - new_var = save_block.create_var( - name=each_var.name, - shape=each_var.shape, - dtype=each_var.dtype, - type=each_var.type, - lod_level=each_var.lod_level, - persistable=True) - file_path = os.path.join(dirname, new_var.name) - file_path = os.path.normpath(file_path) - save_block.append_op( - type='save', - inputs={'X': [new_var]}, - outputs={}, - attrs={'file_path': file_path}) - - executor.run(save_program) - - def save_checkpoint(exe, program, ckpt_name): """ Save checkpoint for evaluation or resume training @@ -141,29 +110,22 @@ def save_checkpoint(exe, program, ckpt_name): if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) - save_vars( - exe, - ckpt_dir, - program, - vars=list(filter(fluid.io.is_persistable, program.list_vars()))) + fluid.save(program, os.path.join(ckpt_dir, 'model')) return ckpt_dir def load_checkpoint(exe, program): """ - Load checkpoiont from pretrained model directory for resume training + Load checkpoiont for resuming training """ - - print('Resume model training from:', cfg.TRAIN.RESUME_MODEL_DIR) - if not os.path.exists(cfg.TRAIN.RESUME_MODEL_DIR): - raise ValueError("TRAIN.PRETRAIN_MODEL {} not exist!".format( - cfg.TRAIN.RESUME_MODEL_DIR)) - - fluid.io.load_persistables( - exe, cfg.TRAIN.RESUME_MODEL_DIR, main_program=program) - model_path = cfg.TRAIN.RESUME_MODEL_DIR + print('Resume model training from:', model_path) + if not os.path.exists(model_path): + raise ValueError( + "TRAIN.PRETRAIN_MODEL {} not exist!".format(model_path)) + fluid.load(program, os.path.join(model_path, 'model'), exe) + # Check is path ended by path spearator if model_path[-1] == os.sep: model_path = model_path[0:-1] @@ -178,7 +140,6 @@ def load_checkpoint(exe, program): else: raise ValueError("Resume model path is not valid!") print("Model checkpoint loaded successfully!") - return begin_epoch @@ -271,44 +232,7 @@ def train(cfg): begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): - print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) - load_vars = [] - load_fail_vars = [] - - def var_shape_matched(var, shape): - """ - Check whehter persitable variable shape is match with current network - """ - var_exist = os.path.exists( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - if var_exist: - var_shape = parse_shape_from_file( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - if var_shape != shape: - print(var.name, var_shape, shape) - return var_shape == shape - return False - - for x in train_prog.list_vars(): - if isinstance(x, fluid.framework.Parameter): - shape = tuple(fluid.global_scope().find_var( - x.name).get_tensor().shape()) - if var_shape_matched(x, shape): - load_vars.append(x) - else: - load_fail_vars.append(x) - - fluid.io.load_vars( - exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) - for var in load_vars: - print_info("Parameter[{}] loaded sucessfully!".format(var.name)) - for var in load_fail_vars: - print_info( - "Parameter[{}] don't exist or shape does not match current network, skip" - " to load it.".format(var.name)) - print_info("{}/{} pretrained parameters loaded successfully!".format( - len(load_vars), - len(load_vars) + len(load_fail_vars))) + load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. @@ -393,8 +317,7 @@ def train(cfg): avg_emb_loss, avg_acc, avg_fp, avg_fn, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: - log_writer.add_scalar('Train/loss', avg_loss, - step) + log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() @@ -423,8 +346,7 @@ def train(cfg): use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: - log_writer.add_scalar('Evaluate/accuracy', accuracy, - step) + log_writer.add_scalar('Evaluate/accuracy', accuracy, step) log_writer.add_scalar('Evaluate/fp', fp, step) log_writer.add_scalar('Evaluate/fn', fn, step) diff --git a/contrib/LaneNet/utils/load_model_utils.py b/contrib/LaneNet/utils/load_model_utils.py new file mode 100644 index 00000000..402e5917 --- /dev/null +++ b/contrib/LaneNet/utils/load_model_utils.py @@ -0,0 +1,126 @@ +# coding: utf8 +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import os.path as osp + +import six +import numpy as np + + +def parse_param_file(param_file, return_shape=True): + from paddle.fluid.proto.framework_pb2 import VarType + f = open(param_file, 'rb') + version = np.fromstring(f.read(4), dtype='int32') + lod_level = np.fromstring(f.read(8), dtype='int64') + for i in range(int(lod_level)): + _size = np.fromstring(f.read(8), dtype='int64') + _ = f.read(_size) + version = np.fromstring(f.read(4), dtype='int32') + tensor_desc = VarType.TensorDesc() + tensor_desc_size = np.fromstring(f.read(4), dtype='int32') + tensor_desc.ParseFromString(f.read(int(tensor_desc_size))) + tensor_shape = tuple(tensor_desc.dims) + if return_shape: + f.close() + return tuple(tensor_desc.dims) + if tensor_desc.data_type != 5: + raise Exception( + "Unexpected data type while parse {}".format(param_file)) + data_size = 4 + for i in range(len(tensor_shape)): + data_size *= tensor_shape[i] + weight = np.fromstring(f.read(data_size), dtype='float32') + f.close() + return np.reshape(weight, tensor_shape) + + +def load_pdparams(exe, main_prog, model_dir): + import paddle.fluid as fluid + from paddle.fluid.proto.framework_pb2 import VarType + from paddle.fluid.framework import Program + + vars_to_load = list() + vars_not_load = list() + import pickle + with open(osp.join(model_dir, 'model.pdparams'), 'rb') as f: + params_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + unused_vars = list() + for var in main_prog.list_vars(): + if not isinstance(var, fluid.framework.Parameter): + continue + if var.name not in params_dict: + print("{} is not in saved model".format(var.name)) + vars_not_load.append(var.name) + continue + if var.shape != params_dict[var.name].shape: + unused_vars.append(var.name) + vars_not_load.append(var.name) + print( + "[SKIP] Shape of pretrained weight {} doesn't match.(Pretrained: {}, Actual: {})" + .format(var.name, params_dict[var.name].shape, var.shape)) + continue + vars_to_load.append(var) + for var_name in unused_vars: + del params_dict[var_name] + fluid.io.set_program_state(main_prog, params_dict) + + if len(vars_to_load) == 0: + print( + "There is no pretrain weights loaded, maybe you should check you pretrain model!" + ) + else: + print("There are {}/{} varaibles in {} are loaded.".format( + len(vars_to_load), + len(vars_to_load) + len(vars_not_load), model_dir)) + + +def load_pretrained_weights(exe, main_prog, weights_dir): + if not osp.exists(weights_dir): + raise Exception("Path {} not exists.".format(weights_dir)) + if osp.exists(osp.join(weights_dir, "model.pdparams")): + return load_pdparams(exe, main_prog, weights_dir) + import paddle.fluid as fluid + vars_to_load = list() + vars_not_load = list() + for var in main_prog.list_vars(): + if not isinstance(var, fluid.framework.Parameter): + continue + if not osp.exists(osp.join(weights_dir, var.name)): + print("[SKIP] Pretrained weight {}/{} doesn't exist".format( + weights_dir, var.name)) + vars_not_load.append(var) + continue + pretrained_shape = parse_param_file(osp.join(weights_dir, var.name)) + actual_shape = tuple(var.shape) + if pretrained_shape != actual_shape: + print( + "[SKIP] Shape of pretrained weight {}/{} doesn't match.(Pretrained: {}, Actual: {})" + .format(weights_dir, var.name, pretrained_shape, actual_shape)) + vars_not_load.append(var) + continue + vars_to_load.append(var) + params_dict = fluid.io.load_program_state( + weights_dir, var_list=vars_to_load) + fluid.io.set_program_state(main_prog, params_dict) + if len(vars_to_load) == 0: + print( + "There is no pretrain weights loaded, maybe you should check you pretrain model!" + ) + else: + print("There are {}/{} varaibles in {} are loaded.".format( + len(vars_to_load), + len(vars_to_load) + len(vars_not_load), weights_dir)) diff --git a/contrib/LaneNet/vis.py b/contrib/LaneNet/vis.py index 59425875..40ac1730 100644 --- a/contrib/LaneNet/vis.py +++ b/contrib/LaneNet/vis.py @@ -45,6 +45,7 @@ from models.model_builder import ModelPhase from utils import lanenet_postprocess import matplotlib.pyplot as plt + def parse_args(): parser = argparse.ArgumentParser(description='PaddeSeg visualization tools') parser.add_argument( @@ -106,7 +107,6 @@ def minmax_scale(input_arr): return output_arr - def visualize(cfg, vis_file_list=None, use_gpu=False, @@ -119,7 +119,6 @@ def visualize(cfg, if vis_file_list is None: vis_file_list = cfg.DATASET.TEST_FILE_LIST - dataset = LaneNetDataset( file_list=vis_file_list, mode=ModelPhase.VISUAL, @@ -139,7 +138,12 @@ def visualize(cfg, ckpt_dir = cfg.TEST.TEST_MODEL if not ckpt_dir else ckpt_dir - fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) + if ckpt_dir is not None: + print('load test model:', ckpt_dir) + try: + fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) + except: + fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) save_dir = os.path.join(vis_dir, 'visual_results') makedirs(save_dir) @@ -161,22 +165,26 @@ def visualize(cfg, for i in range(num_imgs): gt_image = org_imgs[i] - binary_seg_image, instance_seg_image = segLogits[i].squeeze(-1), emLogits[i].transpose((1,2,0)) + binary_seg_image, instance_seg_image = segLogits[i].squeeze( + -1), emLogits[i].transpose((1, 2, 0)) postprocess_result = postprocessor.postprocess( binary_seg_result=binary_seg_image, instance_seg_result=instance_seg_image, - source_image=gt_image - ) - pred_binary_fn = os.path.join(save_dir, to_png_fn(img_names[i], name='_pred_binary')) - pred_lane_fn = os.path.join(save_dir, to_png_fn(img_names[i], name='_pred_lane')) - pred_instance_fn = os.path.join(save_dir, to_png_fn(img_names[i], name='_pred_instance')) + source_image=gt_image) + pred_binary_fn = os.path.join( + save_dir, to_png_fn(img_names[i], name='_pred_binary')) + pred_lane_fn = os.path.join( + save_dir, to_png_fn(img_names[i], name='_pred_lane')) + pred_instance_fn = os.path.join( + save_dir, to_png_fn(img_names[i], name='_pred_instance')) dirname = os.path.dirname(pred_binary_fn) makedirs(dirname) mask_image = postprocess_result['mask_image'] for i in range(4): - instance_seg_image[:, :, i] = minmax_scale(instance_seg_image[:, :, i]) + instance_seg_image[:, :, i] = minmax_scale( + instance_seg_image[:, :, i]) embedding_image = np.array(instance_seg_image).astype(np.uint8) plt.figure('mask_image') @@ -189,13 +197,13 @@ def visualize(cfg, plt.imshow(binary_seg_image * 255, cmap='gray') plt.show() - cv2.imwrite(pred_binary_fn, np.array(binary_seg_image * 255).astype(np.uint8)) + cv2.imwrite(pred_binary_fn, + np.array(binary_seg_image * 255).astype(np.uint8)) cv2.imwrite(pred_lane_fn, postprocess_result['source_image']) cv2.imwrite(pred_instance_fn, mask_image) print(pred_lane_fn, 'saved!') - if __name__ == '__main__': args = parse_args() if args.cfg_file is not None: diff --git a/contrib/RemoteSensing/utils/utils.py b/contrib/RemoteSensing/utils/utils.py index d9715192..4ab1e021 100644 --- a/contrib/RemoteSensing/utils/utils.py +++ b/contrib/RemoteSensing/utils/utils.py @@ -201,11 +201,9 @@ def load_pretrain_weights(exe, main_prog, weights_dir, fuse_bn=False): vars_to_load.append(var) logging.debug("Weight {} will be load".format(var.name)) - fluid.io.load_vars( - executor=exe, - dirname=weights_dir, - main_program=main_prog, - vars=vars_to_load) + params_dict = fluid.io.load_program_state( + weights_dir, var_list=vars_to_load) + fluid.io.set_program_state(main_prog, params_dict) if len(vars_to_load) == 0: logging.warning( "There is no pretrain weights loaded, maybe you should check you pretrain model!" diff --git a/pdseg/eval.py b/pdseg/eval.py index 426e52f9..abb00f0e 100644 --- a/pdseg/eval.py +++ b/pdseg/eval.py @@ -22,13 +22,9 @@ import os os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" import sys -import time import argparse -import functools import pprint -import cv2 import numpy as np -import paddle import paddle.fluid as fluid from utils.config import cfg @@ -116,7 +112,10 @@ def evaluate(cfg, ckpt_dir=None, use_gpu=False, use_mpio=False, **kwargs): if ckpt_dir is not None: print('load test model:', ckpt_dir) - fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) + try: + fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) + except: + fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) # Use streaming confusion matrix to calculate mean_iou np.set_printoptions( diff --git a/pdseg/export_model.py b/pdseg/export_model.py index 93e4b494..b7c82680 100644 --- a/pdseg/export_model.py +++ b/pdseg/export_model.py @@ -49,6 +49,7 @@ def parse_args(): sys.exit(1) return parser.parse_args() + def export_inference_config(): deploy_cfg = '''DEPLOY: USE_GPU : 1 @@ -66,9 +67,8 @@ def export_inference_config(): PREDICTOR_MODE : "ANALYSIS" BATCH_SIZE : 1 ''' % (cfg.FREEZE.SAVE_DIR, cfg.FREEZE.MODEL_FILENAME, - cfg.FREEZE.PARAMS_FILENAME, cfg.EVAL_CROP_SIZE, - cfg.MEAN, cfg.STD, cfg.DATASET.IMAGE_TYPE, - cfg.DATASET.NUM_CLASSES, len(cfg.STD)) + cfg.FREEZE.PARAMS_FILENAME, cfg.EVAL_CROP_SIZE, cfg.MEAN, cfg.STD, + cfg.DATASET.IMAGE_TYPE, cfg.DATASET.NUM_CLASSES, len(cfg.STD)) if not os.path.exists(cfg.FREEZE.SAVE_DIR): os.mkdir(cfg.FREEZE.SAVE_DIR) yaml_path = os.path.join(cfg.FREEZE.SAVE_DIR, 'deploy.yaml') @@ -94,7 +94,13 @@ def export_inference_model(args): infer_prog = infer_prog.clone(for_test=True) if os.path.exists(cfg.TEST.TEST_MODEL): - fluid.io.load_params(exe, cfg.TEST.TEST_MODEL, main_program=infer_prog) + print('load test model:', cfg.TEST.TEST_MODEL) + try: + fluid.load(infer_prog, os.path.join(cfg.TEST.TEST_MODEL, 'model'), + exe) + except: + fluid.io.load_params( + exe, cfg.TEST.TEST_MODEL, main_program=infer_prog) else: print("TEST.TEST_MODEL diretory is empty!") exit(-1) diff --git a/pdseg/train.py b/pdseg/train.py index e1c498a4..e1d1dd4e 100644 --- a/pdseg/train.py +++ b/pdseg/train.py @@ -26,9 +26,7 @@ import argparse import pprint import random import shutil -import functools -import paddle import numpy as np import paddle.fluid as fluid from paddle.fluid import profiler @@ -39,10 +37,10 @@ from metrics import ConfusionMatrix from reader import SegDataset from models.model_builder import build_model from models.model_builder import ModelPhase -from models.model_builder import parse_shape_from_file from eval import evaluate from vis import visualize from utils import dist_utils +from utils.load_model_utils import load_pretrained_weights def parse_args(): @@ -118,38 +116,7 @@ def parse_args(): return parser.parse_args() -def save_vars(executor, dirname, program=None, vars=None): - """ - Temporary resolution for Win save variables compatability. - Will fix in PaddlePaddle v1.5.2 - """ - - save_program = fluid.Program() - save_block = save_program.global_block() - - for each_var in vars: - # NOTE: don't save the variable which type is RAW - if each_var.type == fluid.core.VarDesc.VarType.RAW: - continue - new_var = save_block.create_var( - name=each_var.name, - shape=each_var.shape, - dtype=each_var.dtype, - type=each_var.type, - lod_level=each_var.lod_level, - persistable=True) - file_path = os.path.join(dirname, new_var.name) - file_path = os.path.normpath(file_path) - save_block.append_op( - type='save', - inputs={'X': [new_var]}, - outputs={}, - attrs={'file_path': file_path}) - - executor.run(save_program) - - -def save_checkpoint(exe, program, ckpt_name): +def save_checkpoint(program, ckpt_name): """ Save checkpoint for evaluation or resume training """ @@ -158,29 +125,22 @@ def save_checkpoint(exe, program, ckpt_name): if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) - save_vars( - exe, - ckpt_dir, - program, - vars=list(filter(fluid.io.is_persistable, program.list_vars()))) + fluid.save(program, os.path.join(ckpt_dir, 'model')) return ckpt_dir def load_checkpoint(exe, program): """ - Load checkpoiont from pretrained model directory for resume training + Load checkpoiont for resuming training """ - - print('Resume model training from:', cfg.TRAIN.RESUME_MODEL_DIR) - if not os.path.exists(cfg.TRAIN.RESUME_MODEL_DIR): - raise ValueError("TRAIN.PRETRAIN_MODEL {} not exist!".format( - cfg.TRAIN.RESUME_MODEL_DIR)) - - fluid.io.load_persistables( - exe, cfg.TRAIN.RESUME_MODEL_DIR, main_program=program) - model_path = cfg.TRAIN.RESUME_MODEL_DIR + print('Resume model training from:', model_path) + if not os.path.exists(model_path): + raise ValueError( + "TRAIN.PRETRAIN_MODEL {} not exist!".format(model_path)) + fluid.load(program, os.path.join(model_path, 'model'), exe) + # Check is path ended by path spearator if model_path[-1] == os.sep: model_path = model_path[0:-1] @@ -195,7 +155,6 @@ def load_checkpoint(exe, program): else: raise ValueError("Resume model path is not valid!") print("Model checkpoint loaded successfully!") - return begin_epoch @@ -247,8 +206,6 @@ def train(cfg): yield item[0], item[1], item[2] # Get device environment - # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() - # place = places[0] gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() @@ -304,42 +261,7 @@ def train(cfg): begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): - print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) - load_vars = [] - load_fail_vars = [] - - def var_shape_matched(var, shape): - """ - Check whehter persitable variable shape is match with current network - """ - var_exist = os.path.exists( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - if var_exist: - var_shape = parse_shape_from_file( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - return var_shape == shape - return False - - for x in train_prog.list_vars(): - if isinstance(x, fluid.framework.Parameter): - shape = tuple(fluid.global_scope().find_var( - x.name).get_tensor().shape()) - if var_shape_matched(x, shape): - load_vars.append(x) - else: - load_fail_vars.append(x) - - fluid.io.load_vars( - exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) - for var in load_vars: - print_info("Parameter[{}] loaded sucessfully!".format(var.name)) - for var in load_fail_vars: - print_info( - "Parameter[{}] don't exist or shape does not match current network, skip" - " to load it.".format(var.name)) - print_info("{}/{} pretrained parameters loaded successfully!".format( - len(load_vars), - len(load_vars) + len(load_fail_vars))) + load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. @@ -418,12 +340,9 @@ def train(cfg): step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) - log_writer.add_scalar('Train/loss', avg_loss, - step) - log_writer.add_scalar('Train/lr', lr[0], - step) - log_writer.add_scalar('Train/step/sec', speed, - step) + log_writer.add_scalar('Train/loss', avg_loss, step) + log_writer.add_scalar('Train/lr', lr[0], step) + log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() @@ -445,12 +364,9 @@ def train(cfg): ).format(epoch, step, lr[0], avg_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: - log_writer.add_scalar('Train/loss', avg_loss, - step) - log_writer.add_scalar('Train/lr', lr[0], - step) - log_writer.add_scalar('Train/speed', speed, - step) + log_writer.add_scalar('Train/loss', avg_loss, step) + log_writer.add_scalar('Train/lr', lr[0], step) + log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 timer.restart() @@ -470,7 +386,7 @@ def train(cfg): if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0: - ckpt_dir = save_checkpoint(exe, train_prog, epoch) + ckpt_dir = save_checkpoint(train_prog, epoch) if args.do_eval: print("Evaluation start") @@ -480,10 +396,8 @@ def train(cfg): use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: - log_writer.add_scalar('Evaluate/mean_iou', mean_iou, - step) - log_writer.add_scalar('Evaluate/mean_acc', mean_acc, - step) + log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) + log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) if mean_iou > best_mIoU: best_mIoU = mean_iou @@ -505,7 +419,7 @@ def train(cfg): # save final model if cfg.TRAINER_ID == 0: - save_checkpoint(exe, train_prog, 'final') + save_checkpoint(train_prog, 'final') def main(args): diff --git a/pdseg/utils/load_model_utils.py b/pdseg/utils/load_model_utils.py new file mode 100644 index 00000000..5a02bcb0 --- /dev/null +++ b/pdseg/utils/load_model_utils.py @@ -0,0 +1,128 @@ +# coding: utf8 +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import os.path as osp + +import six +import numpy as np + + +def parse_param_file(param_file, return_shape=True): + from paddle.fluid.proto.framework_pb2 import VarType + f = open(param_file, 'rb') + version = np.fromstring(f.read(4), dtype='int32') + lod_level = np.fromstring(f.read(8), dtype='int64') + for i in range(int(lod_level)): + _size = np.fromstring(f.read(8), dtype='int64') + _ = f.read(_size) + version = np.fromstring(f.read(4), dtype='int32') + tensor_desc = VarType.TensorDesc() + tensor_desc_size = np.fromstring(f.read(4), dtype='int32') + tensor_desc.ParseFromString(f.read(int(tensor_desc_size))) + tensor_shape = tuple(tensor_desc.dims) + if return_shape: + f.close() + return tuple(tensor_desc.dims) + if tensor_desc.data_type != 5: + raise Exception( + "Unexpected data type while parse {}".format(param_file)) + data_size = 4 + for i in range(len(tensor_shape)): + data_size *= tensor_shape[i] + weight = np.fromstring(f.read(data_size), dtype='float32') + f.close() + return np.reshape(weight, tensor_shape) + + +def load_pdparams(exe, main_prog, model_dir): + import paddle.fluid as fluid + from paddle.fluid.proto.framework_pb2 import VarType + from paddle.fluid.framework import Program + + vars_to_load = list() + vars_not_load = list() + import pickle + with open(osp.join(model_dir, 'model.pdparams'), 'rb') as f: + params_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + unused_vars = list() + for var in main_prog.list_vars(): + if not isinstance(var, fluid.framework.Parameter): + continue + if var.name not in params_dict: + print("{} is not in saved model".format(var.name)) + vars_not_load.append(var.name) + continue + if var.shape != params_dict[var.name].shape: + unused_vars.append(var.name) + vars_not_load.append(var.name) + print( + "[SKIP] Shape of pretrained weight {} doesn't match.(Pretrained: {}, Actual: {})" + .format(var.name, params_dict[var.name].shape, var.shape)) + continue + vars_to_load.append(var) + for var_name in unused_vars: + del params_dict[var_name] + fluid.io.set_program_state(main_prog, params_dict) + + if len(vars_to_load) == 0: + print( + "There is no pretrain weights loaded, maybe you should check you pretrain model!" + ) + else: + print("There are {}/{} varaibles in {} are loaded.".format( + len(vars_to_load), + len(vars_to_load) + len(vars_not_load), model_dir)) + + +def load_pretrained_weights(exe, main_prog, weights_dir): + if not osp.exists(weights_dir): + raise Exception("Path {} not exists.".format(weights_dir)) + if osp.exists(osp.join(weights_dir, "model.pdparams")): + return load_pdparams(exe, main_prog, weights_dir) + import paddle.fluid as fluid + vars_to_load = list() + vars_not_load = list() + for var in main_prog.list_vars(): + if not isinstance(var, fluid.framework.Parameter): + continue + if not osp.exists(osp.join(weights_dir, var.name)): + print("[SKIP] Pretrained weight {}/{} doesn't exist".format( + weights_dir, var.name)) + vars_not_load.append(var) + continue + pretrained_shape = parse_param_file(osp.join(weights_dir, var.name)) + actual_shape = tuple(var.shape) + if pretrained_shape != actual_shape: + print( + "[SKIP] Shape of pretrained weight {}/{} doesn't match.(Pretrained: {}, Actual: {})" + .format(weights_dir, var.name, pretrained_shape, actual_shape)) + vars_not_load.append(var) + continue + vars_to_load.append(var) + + params_dict = fluid.io.load_program_state( + weights_dir, var_list=vars_to_load) + fluid.io.set_program_state(main_prog, params_dict) + + if len(vars_to_load) == 0: + print( + "There is no pretrain weights loaded, maybe you should check you pretrain model!" + ) + else: + print("There are {}/{} varaibles in {} are loaded.".format( + len(vars_to_load), + len(vars_to_load) + len(vars_not_load), weights_dir)) diff --git a/pdseg/vis.py b/pdseg/vis.py index 0dc30273..badaec6c 100644 --- a/pdseg/vis.py +++ b/pdseg/vis.py @@ -115,7 +115,12 @@ def visualize(cfg, ckpt_dir = cfg.TEST.TEST_MODEL if not ckpt_dir else ckpt_dir - fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) + if ckpt_dir is not None: + print('load test model:', ckpt_dir) + try: + fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) + except: + fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) save_dir = vis_dir makedirs(save_dir) @@ -169,18 +174,13 @@ def visualize(cfg, print("VisualDL visualization epoch", epoch) pred_mask_np = np.array(pred_mask.convert("RGB")) - log_writer.add_image( - "Predict/{}".format(img_name), - pred_mask_np, - epoch) + log_writer.add_image("Predict/{}".format(img_name), + pred_mask_np, epoch) # Original image # BGR->RGB - img = cv2.imread( - os.path.join(cfg.DATASET.DATA_DIR, img_name))[..., ::-1] - log_writer.add_image( - "Images/{}".format(img_name), - img, - epoch) + img = cv2.imread(os.path.join(cfg.DATASET.DATA_DIR, + img_name))[..., ::-1] + log_writer.add_image("Images/{}".format(img_name), img, epoch) # add ground truth (label) images grt = grts[i] if grt is not None: @@ -189,10 +189,8 @@ def visualize(cfg, grt_pil.putpalette(color_map) grt_pil = grt_pil.resize((org_shape[1], org_shape[0])) grt = np.array(grt_pil.convert("RGB")) - log_writer.add_image( - "Label/{}".format(img_name), - grt, - epoch) + log_writer.add_image("Label/{}".format(img_name), grt, + epoch) # If in local_test mode, only visualize 5 images just for testing # procedure diff --git a/slim/distillation/train_distill.py b/slim/distillation/train_distill.py index 995cab1f..88557605 100644 --- a/slim/distillation/train_distill.py +++ b/slim/distillation/train_distill.py @@ -44,6 +44,7 @@ from model_builder import parse_shape_from_file from eval import evaluate from vis import visualize from utils import dist_utils +from utils.load_model_utils import load_pretrained_weights import solver from paddleslim.dist.single_distiller import merge, l2_loss @@ -116,38 +117,7 @@ def parse_args(): return parser.parse_args() -def save_vars(executor, dirname, program=None, vars=None): - """ - Temporary resolution for Win save variables compatability. - Will fix in PaddlePaddle v1.5.2 - """ - - save_program = fluid.Program() - save_block = save_program.global_block() - - for each_var in vars: - # NOTE: don't save the variable which type is RAW - if each_var.type == fluid.core.VarDesc.VarType.RAW: - continue - new_var = save_block.create_var( - name=each_var.name, - shape=each_var.shape, - dtype=each_var.dtype, - type=each_var.type, - lod_level=each_var.lod_level, - persistable=True) - file_path = os.path.join(dirname, new_var.name) - file_path = os.path.normpath(file_path) - save_block.append_op( - type='save', - inputs={'X': [new_var]}, - outputs={}, - attrs={'file_path': file_path}) - - executor.run(save_program) - - -def save_checkpoint(exe, program, ckpt_name): +def save_checkpoint(program, ckpt_name): """ Save checkpoint for evaluation or resume training """ @@ -156,29 +126,22 @@ def save_checkpoint(exe, program, ckpt_name): if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) - save_vars( - exe, - ckpt_dir, - program, - vars=list(filter(fluid.io.is_persistable, program.list_vars()))) + fluid.save(program, os.path.join(ckpt_dir, 'model')) return ckpt_dir def load_checkpoint(exe, program): """ - Load checkpoiont from pretrained model directory for resume training + Load checkpoiont for resuming training """ - - print('Resume model training from:', cfg.TRAIN.RESUME_MODEL_DIR) - if not os.path.exists(cfg.TRAIN.RESUME_MODEL_DIR): - raise ValueError("TRAIN.PRETRAIN_MODEL {} not exist!".format( - cfg.TRAIN.RESUME_MODEL_DIR)) - - fluid.io.load_persistables( - exe, cfg.TRAIN.RESUME_MODEL_DIR, main_program=program) - model_path = cfg.TRAIN.RESUME_MODEL_DIR + print('Resume model training from:', model_path) + if not os.path.exists(model_path): + raise ValueError( + "TRAIN.PRETRAIN_MODEL {} not exist!".format(model_path)) + fluid.load(program, os.path.join(model_path, 'model'), exe) + # Check is path ended by path spearator if model_path[-1] == os.sep: model_path = model_path[0:-1] @@ -193,7 +156,6 @@ def load_checkpoint(exe, program): else: raise ValueError("Resume model path is not valid!") print("Model checkpoint loaded successfully!") - return begin_epoch @@ -289,7 +251,11 @@ def train(cfg): ckpt_dir = cfg.SLIM.KNOWLEDGE_DISTILL_TEACHER_MODEL_DIR assert ckpt_dir is not None print('load teacher model:', ckpt_dir) - fluid.io.load_params(exe, ckpt_dir, main_program=teacher_program) + if os.path.exists(ckpt_dir): + try: + fluid.load(teacher_program, os.path.join(ckpt_dir, 'model'), exe) + except: + fluid.io.load_params(exe, ckpt_dir, main_program=teacher_program) # cfg = load_config(FLAGS.config) cfg.update_from_file(args.cfg_file) @@ -355,42 +321,8 @@ def train(cfg): begin_epoch = load_checkpoint(exe, fluid.default_main_program()) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): - print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) - load_vars = [] - load_fail_vars = [] - - def var_shape_matched(var, shape): - """ - Check whehter persitable variable shape is match with current network - """ - var_exist = os.path.exists( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - if var_exist: - var_shape = parse_shape_from_file( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - return var_shape == shape - return False - - for x in fluid.default_main_program().list_vars(): - if isinstance(x, fluid.framework.Parameter): - shape = tuple(fluid.global_scope().find_var( - x.name).get_tensor().shape()) - if var_shape_matched(x, shape): - load_vars.append(x) - else: - load_fail_vars.append(x) - - fluid.io.load_vars( - exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) - for var in load_vars: - print_info("Parameter[{}] loaded sucessfully!".format(var.name)) - for var in load_fail_vars: - print_info( - "Parameter[{}] don't exist or shape does not match current network, skip" - " to load it.".format(var.name)) - print_info("{}/{} pretrained parameters loaded successfully!".format( - len(load_vars), - len(load_vars) + len(load_fail_vars))) + load_pretrained_weights(exe, fluid.default_main_program(), + cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. @@ -475,12 +407,9 @@ def train(cfg): step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) - log_writer.add_scalar('Train/loss', avg_loss, - step) - log_writer.add_scalar('Train/lr', lr[0], - step) - log_writer.add_scalar('Train/step/sec', speed, - step) + log_writer.add_scalar('Train/loss', avg_loss, step) + log_writer.add_scalar('Train/lr', lr[0], step) + log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() @@ -503,16 +432,13 @@ def train(cfg): speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}" - ).format(epoch, step, lr[0], avg_loss, - avg_t_loss, avg_d_loss, speed, + ).format(epoch, step, lr[0], avg_loss, avg_t_loss, + avg_d_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: - log_writer.add_scalar('Train/loss', avg_loss, - step) - log_writer.add_scalar('Train/lr', lr[0], - step) - log_writer.add_scalar('Train/speed', speed, - step) + log_writer.add_scalar('Train/loss', avg_loss, step) + log_writer.add_scalar('Train/lr', lr[0], step) + log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 avg_t_loss = 0.0 @@ -527,7 +453,7 @@ def train(cfg): if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0: - ckpt_dir = save_checkpoint(exe, fluid.default_main_program(), epoch) + ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch) if args.do_eval: print("Evaluation start") @@ -537,10 +463,8 @@ def train(cfg): use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: - log_writer.add_scalar('Evaluate/mean_iou', mean_iou, - step) - log_writer.add_scalar('Evaluate/mean_acc', mean_acc, - step) + log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) + log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) if mean_iou > best_mIoU: best_mIoU = mean_iou @@ -560,11 +484,11 @@ def train(cfg): ckpt_dir=ckpt_dir, log_writer=log_writer) if cfg.TRAINER_ID == 0: - ckpt_dir = save_checkpoint(exe, fluid.default_main_program(), epoch) + ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch) # save final model if cfg.TRAINER_ID == 0: - save_checkpoint(exe, fluid.default_main_program(), 'final') + save_checkpoint(fluid.default_main_program(), 'final') def main(args): diff --git a/slim/nas/eval_nas.py b/slim/nas/eval_nas.py index 7f8663df..066915c7 100644 --- a/slim/nas/eval_nas.py +++ b/slim/nas/eval_nas.py @@ -123,7 +123,10 @@ def evaluate(cfg, ckpt_dir=None, use_gpu=False, use_mpio=False, **kwargs): if ckpt_dir is not None: print('load test model:', ckpt_dir) - fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) + try: + fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) + except: + fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) # Use streaming confusion matrix to calculate mean_iou np.set_printoptions( diff --git a/slim/nas/train_nas.py b/slim/nas/train_nas.py index f4cd8f81..42739d99 100644 --- a/slim/nas/train_nas.py +++ b/slim/nas/train_nas.py @@ -47,6 +47,7 @@ from model_builder import parse_shape_from_file from eval_nas import evaluate from vis import visualize from utils import dist_utils +from utils.load_model_utils import load_pretrained_weights from mobilenetv2_search_space import MobileNetV2SpaceSeg from paddleslim.nas.search_space.search_space_factory import SearchSpaceFactory @@ -116,38 +117,7 @@ def parse_args(): return parser.parse_args() -def save_vars(executor, dirname, program=None, vars=None): - """ - Temporary resolution for Win save variables compatability. - Will fix in PaddlePaddle v1.5.2 - """ - - save_program = fluid.Program() - save_block = save_program.global_block() - - for each_var in vars: - # NOTE: don't save the variable which type is RAW - if each_var.type == fluid.core.VarDesc.VarType.RAW: - continue - new_var = save_block.create_var( - name=each_var.name, - shape=each_var.shape, - dtype=each_var.dtype, - type=each_var.type, - lod_level=each_var.lod_level, - persistable=True) - file_path = os.path.join(dirname, new_var.name) - file_path = os.path.normpath(file_path) - save_block.append_op( - type='save', - inputs={'X': [new_var]}, - outputs={}, - attrs={'file_path': file_path}) - - executor.run(save_program) - - -def save_checkpoint(exe, program, ckpt_name): +def save_checkpoint(program, ckpt_name): """ Save checkpoint for evaluation or resume training """ @@ -156,29 +126,22 @@ def save_checkpoint(exe, program, ckpt_name): if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) - save_vars( - exe, - ckpt_dir, - program, - vars=list(filter(fluid.io.is_persistable, program.list_vars()))) + fluid.save(program, os.path.join(ckpt_dir, 'model')) return ckpt_dir def load_checkpoint(exe, program): """ - Load checkpoiont from pretrained model directory for resume training + Load checkpoiont for resuming training """ - - print('Resume model training from:', cfg.TRAIN.RESUME_MODEL_DIR) - if not os.path.exists(cfg.TRAIN.RESUME_MODEL_DIR): - raise ValueError("TRAIN.PRETRAIN_MODEL {} not exist!".format( - cfg.TRAIN.RESUME_MODEL_DIR)) - - fluid.io.load_persistables( - exe, cfg.TRAIN.RESUME_MODEL_DIR, main_program=program) - model_path = cfg.TRAIN.RESUME_MODEL_DIR + print('Resume model training from:', model_path) + if not os.path.exists(model_path): + raise ValueError( + "TRAIN.PRETRAIN_MODEL {} not exist!".format(model_path)) + fluid.load(program, os.path.join(model_path, 'model'), exe) + # Check is path ended by path spearator if model_path[-1] == os.sep: model_path = model_path[0:-1] @@ -193,7 +156,6 @@ def load_checkpoint(exe, program): else: raise ValueError("Resume model path is not valid!") print("Model checkpoint loaded successfully!") - return begin_epoch @@ -245,8 +207,6 @@ def train(cfg): yield item[0], item[1], item[2] # Get device environment - # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() - # place = places[0] gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() @@ -326,43 +286,8 @@ def train(cfg): begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): - print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) - load_vars = [] - load_fail_vars = [] - - def var_shape_matched(var, shape): - """ - Check whehter persitable variable shape is match with current network - """ - var_exist = os.path.exists( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - if var_exist: - var_shape = parse_shape_from_file( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - return var_shape == shape - return False - - for x in train_prog.list_vars(): - if isinstance(x, fluid.framework.Parameter): - shape = tuple(fluid.global_scope().find_var( - x.name).get_tensor().shape()) - if var_shape_matched(x, shape): - load_vars.append(x) - else: - load_fail_vars.append(x) - - fluid.io.load_vars( - exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) - for var in load_vars: - print_info("Parameter[{}] loaded sucessfully!".format(var.name)) - for var in load_fail_vars: - print_info( - "Parameter[{}] don't exist or shape does not match current network, skip" - " to load it.".format(var.name)) - print_info( - "{}/{} pretrained parameters loaded successfully!".format( - len(load_vars), - len(load_vars) + len(load_fail_vars))) + load_pretrained_weights(exe, train_prog, + cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. @@ -419,8 +344,7 @@ def train(cfg): except Exception as e: print(e) if epoch > cfg.SLIM.NAS_START_EVAL_EPOCH: - ckpt_dir = save_checkpoint(exe, train_prog, - '{}_tmp'.format(port)) + ckpt_dir = save_checkpoint(train_prog, '{}_tmp'.format(port)) _, mean_iou, _, mean_acc = evaluate( cfg=cfg, arch=arch, diff --git a/slim/prune/train_prune.py b/slim/prune/train_prune.py index 6c41e74b..ef54c6c8 100644 --- a/slim/prune/train_prune.py +++ b/slim/prune/train_prune.py @@ -46,6 +46,7 @@ from models.model_builder import parse_shape_from_file from eval_prune import evaluate from vis import visualize from utils import dist_utils +from utils.load_model_utils import load_pretrained_weights from paddleslim.prune import Pruner, save_model from paddleslim.analysis import flops @@ -285,42 +286,7 @@ def train(cfg): begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): - print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) - load_vars = [] - load_fail_vars = [] - - def var_shape_matched(var, shape): - """ - Check whehter persitable variable shape is match with current network - """ - var_exist = os.path.exists( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - if var_exist: - var_shape = parse_shape_from_file( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - return var_shape == shape - return False - - for x in train_prog.list_vars(): - if isinstance(x, fluid.framework.Parameter): - shape = tuple(fluid.global_scope().find_var( - x.name).get_tensor().shape()) - if var_shape_matched(x, shape): - load_vars.append(x) - else: - load_fail_vars.append(x) - - fluid.io.load_vars( - exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) - for var in load_vars: - print_info("Parameter[{}] loaded sucessfully!".format(var.name)) - for var in load_fail_vars: - print_info( - "Parameter[{}] don't exist or shape does not match current network, skip" - " to load it.".format(var.name)) - print_info("{}/{} pretrained parameters loaded successfully!".format( - len(load_vars), - len(load_vars) + len(load_fail_vars))) + load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. @@ -409,12 +375,9 @@ def train(cfg): step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) - log_writer.add_scalar('Train/loss', avg_loss, - step) - log_writer.add_scalar('Train/lr', lr[0], - step) - log_writer.add_scalar('Train/step/sec', speed, - step) + log_writer.add_scalar('Train/loss', avg_loss, step) + log_writer.add_scalar('Train/lr', lr[0], step) + log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() @@ -436,12 +399,9 @@ def train(cfg): ).format(epoch, step, lr[0], avg_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: - log_writer.add_scalar('Train/loss', avg_loss, - step) - log_writer.add_scalar('Train/lr', lr[0], - step) - log_writer.add_scalar('Train/speed', speed, - step) + log_writer.add_scalar('Train/loss', avg_loss, step) + log_writer.add_scalar('Train/lr', lr[0], step) + log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 timer.restart() @@ -464,10 +424,8 @@ def train(cfg): use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: - log_writer.add_scalar('Evaluate/mean_iou', mean_iou, - step) - log_writer.add_scalar('Evaluate/mean_acc', mean_acc, - step) + log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) + log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) # Use VisualDL to visualize results if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: diff --git a/slim/quantization/train_quant.py b/slim/quantization/train_quant.py index 1034b723..50017918 100644 --- a/slim/quantization/train_quant.py +++ b/slim/quantization/train_quant.py @@ -40,7 +40,8 @@ from models.model_builder import parse_shape_from_file from eval_quant import evaluate from vis import visualize from utils import dist_utils -from train import save_vars, save_checkpoint, load_checkpoint, update_best_model, print_info +from utils.load_model_utils import load_pretrained_weights +from train import update_best_model, print_info from paddleslim.quant import quant_aware @@ -103,6 +104,55 @@ def parse_args(): return parser.parse_args() +def save_checkpoint(exe, program, ckpt_name): + """ + Save checkpoint for evaluation or resume training + """ + ckpt_dir = os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, str(ckpt_name)) + print("Save model checkpoint to {}".format(ckpt_dir)) + if not os.path.isdir(ckpt_dir): + os.makedirs(ckpt_dir) + + fluid.io.save_vars( + exe, + ckpt_dir, + program, + vars=list(filter(fluid.io.is_persistable, program.list_vars()))) + return ckpt_dir + + +def load_checkpoint(exe, program): + """ + Load checkpoiont from pretrained model directory for resume training + """ + + print('Resume model training from:', cfg.TRAIN.RESUME_MODEL_DIR) + if not os.path.exists(cfg.TRAIN.RESUME_MODEL_DIR): + raise ValueError("TRAIN.PRETRAIN_MODEL {} not exist!".format( + cfg.TRAIN.RESUME_MODEL_DIR)) + + fluid.io.load_persistables( + exe, cfg.TRAIN.RESUME_MODEL_DIR, main_program=program) + + model_path = cfg.TRAIN.RESUME_MODEL_DIR + # Check is path ended by path spearator + if model_path[-1] == os.sep: + model_path = model_path[0:-1] + epoch_name = os.path.basename(model_path) + # If resume model is final model + if epoch_name == 'final': + begin_epoch = cfg.SOLVER.NUM_EPOCHS + # If resume model path is end of digit, restore epoch status + elif epoch_name.isdigit(): + epoch = int(epoch_name) + begin_epoch = epoch + 1 + else: + raise ValueError("Resume model path is not valid!") + print("Model checkpoint loaded successfully!") + + return begin_epoch + + def train_quant(cfg): startup_prog = fluid.Program() train_prog = fluid.Program() @@ -182,42 +232,7 @@ def train_quant(cfg): begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): - print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) - load_vars = [] - load_fail_vars = [] - - def var_shape_matched(var, shape): - """ - Check whehter persitable variable shape is match with current network - """ - var_exist = os.path.exists( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - if var_exist: - var_shape = parse_shape_from_file( - os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) - return var_shape == shape - return False - - for x in train_prog.list_vars(): - if isinstance(x, fluid.framework.Parameter): - shape = tuple(fluid.global_scope().find_var( - x.name).get_tensor().shape()) - if var_shape_matched(x, shape): - load_vars.append(x) - else: - load_fail_vars.append(x) - - fluid.io.load_vars( - exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) - for var in load_vars: - print_info("Parameter[{}] loaded sucessfully!".format(var.name)) - for var in load_fail_vars: - print_info( - "Parameter[{}] don't exist or shape does not match current network, skip" - " to load it.".format(var.name)) - print_info("{}/{} pretrained parameters loaded successfully!".format( - len(load_vars), - len(load_vars) + len(load_fail_vars))) + load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. -- GitLab