提交 88325a1e 编写于 作者: M meixiaowei

support load pretrain ckpt and weight initializer modify

上级 59d5c069
...@@ -46,6 +46,7 @@ Parameters for both training and evaluating can be set in config.py. ...@@ -46,6 +46,7 @@ Parameters for both training and evaluating can be set in config.py.
"momentum": 0.9, # momentum optimizer "momentum": 0.9, # momentum optimizer
"weight_decay": 1e-4, # weight decay "weight_decay": 1e-4, # weight decay
"epoch_size": 120, # epoch sizes for training "epoch_size": 120, # epoch sizes for training
"pretrain_epoch_size": 0, # epoch size of pretrain checkpoint
"buffer_size": 1000, # number of queue size in data preprocessing "buffer_size": 1000, # number of queue size in data preprocessing
"image_height": 224, # image height "image_height": 224, # image height
"image_width": 224, # image width "image_width": 224, # image width
...@@ -68,10 +69,10 @@ Parameters for both training and evaluating can be set in config.py. ...@@ -68,10 +69,10 @@ Parameters for both training and evaluating can be set in config.py.
``` ```
# distributed training # distributed training
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)
# standalone training # standalone training
sh run_standalone_train.sh [DATASET_PATH] sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)
``` ```
#### Launch #### Launch
...@@ -79,9 +80,15 @@ sh run_standalone_train.sh [DATASET_PATH] ...@@ -79,9 +80,15 @@ sh run_standalone_train.sh [DATASET_PATH]
```bash ```bash
# distributed training example(8p) # distributed training example(8p)
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
If you want to load pretrained ckpt file,
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./ckpt/pretrained.ckpt
# standalone training example(1p) # standalone training example(1p)
sh run_standalone_train.sh dataset/ilsvrc sh run_standalone_train.sh dataset/ilsvrc
If you want to load pretrained ckpt file,
sh run_standalone_train.sh dataset/ilsvrc ./ckpt/pretrained.ckpt
``` ```
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
......
...@@ -24,6 +24,7 @@ config = ed({ ...@@ -24,6 +24,7 @@ config = ed({
"momentum": 0.9, "momentum": 0.9,
"weight_decay": 1e-4, "weight_decay": 1e-4,
"epoch_size": 120, "epoch_size": 120,
"pretrain_epoch_size": 0,
"buffer_size": 1000, "buffer_size": 1000,
"image_height": 224, "image_height": 224,
"image_width": 224, "image_width": 224,
......
...@@ -21,7 +21,7 @@ def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr): ...@@ -21,7 +21,7 @@ def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
lr = float(init_lr) + lr_inc * current_step lr = float(init_lr) + lr_inc * current_step
return lr return lr
def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0):
""" """
generate learning rate array with cosine generate learning rate array with cosine
...@@ -30,6 +30,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): ...@@ -30,6 +30,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
steps_per_epoch(int): steps size of one epoch steps_per_epoch(int): steps size of one epoch
warmup_epochs(int): number of warmup epochs warmup_epochs(int): number of warmup epochs
max_epoch(int): total epochs of training max_epoch(int): total epochs of training
global_step(int): the current start index of lr array
Returns: Returns:
np.array, learning rate array np.array, learning rate array
""" """
...@@ -49,4 +50,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): ...@@ -49,4 +50,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
decayed = linear_decay * cosine_decay + 0.00001 decayed = linear_decay * cosine_decay + 0.00001
lr = base_lr * decayed lr = base_lr * decayed
lr_each_step.append(lr) lr_each_step.append(lr)
return np.array(lr_each_step).astype(np.float32)
lr_each_step = np.array(lr_each_step).astype(np.float32)
learning_rate = lr_each_step[global_step:]
return learning_rate
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
# limitations under the License. # limitations under the License.
# ============================================================================ # ============================================================================
if [ $# != 2 ] if [ $# != 2 ] && [ $# != 3 ]
then then
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]" echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)"
exit 1 exit 1
fi fi
...@@ -31,6 +31,11 @@ PATH1=$(get_real_path $1) ...@@ -31,6 +31,11 @@ PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2) PATH2=$(get_real_path $2)
echo $PATH1 echo $PATH1
echo $PATH2 echo $PATH2
if [ $# == 3 ]
then
PATH3=$(get_real_path $3)
echo $PATH3
fi
if [ ! -f $PATH1 ] if [ ! -f $PATH1 ]
then then
...@@ -44,6 +49,12 @@ then ...@@ -44,6 +49,12 @@ then
exit 1 exit 1
fi fi
if [ $# == 3 ] && [ ! -f $PATH3 ]
then
echo "error: PRETRAINED_PATH=$PATH3 is not a file"
exit 1
fi
ulimit -u unlimited ulimit -u unlimited
export DEVICE_NUM=8 export DEVICE_NUM=8
export RANK_SIZE=8 export RANK_SIZE=8
...@@ -61,6 +72,15 @@ do ...@@ -61,6 +72,15 @@ do
cd ./train_parallel$i || exit cd ./train_parallel$i || exit
echo "start training for rank $RANK_ID, device $DEVICE_ID" echo "start training for rank $RANK_ID, device $DEVICE_ID"
env > env.log env > env.log
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log & if [ $# == 2 ]
then
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
fi
if [ $# == 3 ]
then
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
fi
cd .. cd ..
done done
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
# limitations under the License. # limitations under the License.
# ============================================================================ # ============================================================================
if [ $# != 1 ] if [ $# != 1 ] && [ $# != 2 ]
then then
echo "Usage: sh run_standalone_train.sh [DATASET_PATH]" echo "Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)"
exit 1 exit 1
fi fi
...@@ -29,12 +29,23 @@ get_real_path(){ ...@@ -29,12 +29,23 @@ get_real_path(){
} }
PATH1=$(get_real_path $1) PATH1=$(get_real_path $1)
echo $PATH1 echo $PATH1
if [ $# == 2 ]
then
PATH2=$(get_real_path $2)
echo $PATH2
fi
if [ ! -d $PATH1 ] if [ ! -d $PATH1 ]
then then
echo "error: DATASET_PATH=$PATH1 is not a directory" echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1 exit 1
fi fi
if [ $# == 2 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_PATH=$PATH2 is not a file"
exit 1
fi
ulimit -u unlimited ulimit -u unlimited
export DEVICE_NUM=1 export DEVICE_NUM=1
...@@ -52,5 +63,13 @@ cp *.sh ./train ...@@ -52,5 +63,13 @@ cp *.sh ./train
cd ./train || exit cd ./train || exit
echo "start training for device $DEVICE_ID" echo "start training for device $DEVICE_ID"
env > env.log env > env.log
python train.py --do_train=True --dataset_path=$PATH1 &> log & if [ $# == 1 ]
then
python train.py --do_train=True --dataset_path=$PATH1 &> log &
fi
if [ $# == 2 ]
then
python train.py --do_train=True --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi
cd .. cd ..
...@@ -44,6 +44,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.') ...@@ -44,6 +44,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.') parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.') parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
args_opt = parser.parse_args() args_opt = parser.parse_args()
device_id = int(os.getenv('DEVICE_ID')) device_id = int(os.getenv('DEVICE_ID'))
...@@ -64,11 +65,11 @@ if __name__ == '__main__': ...@@ -64,11 +65,11 @@ if __name__ == '__main__':
if isinstance(cell, nn.Conv2d): if isinstance(cell, nn.Conv2d):
cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
cell.weight.default_input.shape(), cell.weight.default_input.shape(),
cell.weight.default_input.dtype()) cell.weight.default_input.dtype()).to_tensor()
if isinstance(cell, nn.Dense): if isinstance(cell, nn.Dense):
cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
cell.weight.default_input.shape(), cell.weight.default_input.shape(),
cell.weight.default_input.dtype()) cell.weight.default_input.dtype()).to_tensor()
if not config.label_smooth: if not config.label_smooth:
config.label_smooth_factor = 0.0 config.label_smooth_factor = 0.0
loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
...@@ -77,9 +78,13 @@ if __name__ == '__main__': ...@@ -77,9 +78,13 @@ if __name__ == '__main__':
repeat_num=epoch_size, batch_size=config.batch_size) repeat_num=epoch_size, batch_size=config.batch_size)
step_size = dataset.get_dataset_size() step_size = dataset.get_dataset_size()
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
if args_opt.pre_trained:
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(net, param_dict)
# learning rate strategy with cosine # learning rate strategy with cosine
lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size)) lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, 120,
config.pretrain_epoch_size*step_size))
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
config.weight_decay, config.loss_scale) config.weight_decay, config.loss_scale)
model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False, model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册