提交 987517fd 编写于 作者: S stephon

add multi_node and amp train in script

上级 8bb9fb7e
...@@ -14,8 +14,8 @@ Global: ...@@ -14,8 +14,8 @@ Global:
use_visualdl: False use_visualdl: False
infer_img: doc/imgs_en/img_10.jpg infer_img: doc/imgs_en/img_10.jpg
save_res_path: ./output/det_db/predicts_db.txt save_res_path: ./output/det_db/predicts_db.txt
#amp related
AMP: use_amp: True
scale_loss: 1024.0 scale_loss: 1024.0
use_dynamic_loss_scaling: True use_dynamic_loss_scaling: True
......
===========================train_params=========================== ===========================train_params===========================
model_name:ocr_det model_name:ocr_det
python:python3.7 python:python3.7
gpu_list:0|0,1 gpu_list:0|0,1|10.21.226.181,10.21.226.133;0,1
Global.use_gpu:True|True Global.use_gpu:True|True
Global.auto_cast:null Global.auto_cast:fp32|amp
Global.epoch_num:lite_train_infer=1|whole_train_infer=300 Global.epoch_num:lite_train_infer=1|whole_train_infer=300
Global.save_model_dir:./output/ Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4 Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4
......
...@@ -253,6 +253,11 @@ else ...@@ -253,6 +253,11 @@ else
env=" " env=" "
fi fi
for autocast in ${autocast_list[*]}; do for autocast in ${autocast_list[*]}; do
# Select extra config overrides for AMP (automatic mixed precision) training.
# When the current autocast mode is "amp", pass the AMP switches on the
# command line; otherwise pass a blank so the command stays well-formed.
# FIX: the original wrote "Gloabl.use_amp=True" (typo) — the misspelled key
# is silently ignored by the config merger, so AMP was never enabled.
if [ ${autocast} = "amp" ]; then
    set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True"
else
    set_amp_config=" "
fi
for trainer in ${trainer_list[*]}; do for trainer in ${trainer_list[*]}; do
flag_quant=False flag_quant=False
if [ ${trainer} = ${pact_key} ]; then if [ ${trainer} = ${pact_key} ]; then
...@@ -279,7 +284,6 @@ else ...@@ -279,7 +284,6 @@ else
if [ ${run_train} = "null" ]; then if [ ${run_train} = "null" ]; then
continue continue
fi fi
set_autocast=$(func_set_params "${autocast_key}" "${autocast}") set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
...@@ -295,11 +299,11 @@ else ...@@ -295,11 +299,11 @@ else
set_save_model=$(func_set_params "${save_model_key}" "${save_log}") set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} " cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
elif [ ${#gpu} -le 15 ];then # train with multi-gpu elif [ ${#gpu} -le 15 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}" cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
else # train with multi-machine else # train with multi-machine
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
fi fi
# run train # run train
eval "unset CUDA_VISIBLE_DEVICES" eval "unset CUDA_VISIBLE_DEVICES"
......
...@@ -103,16 +103,16 @@ def main(config, device, logger, vdl_writer): ...@@ -103,16 +103,16 @@ def main(config, device, logger, vdl_writer):
logger.info('valid dataloader has {} iters'.format( logger.info('valid dataloader has {} iters'.format(
len(valid_dataloader))) len(valid_dataloader)))
use_amp = True if "AMP" in config else False use_amp = config["Global"].get("use_amp", False)
if use_amp: if use_amp:
AMP_RELATED_FLAGS_SETTING = { AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_batchnorm_spatial_persistent': 1, 'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
'FLAGS_max_inplace_grad_add': 8, 'FLAGS_max_inplace_grad_add': 8,
} }
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
scale_loss = config["AMP"].get("scale_loss", 1.0) scale_loss = config["Global"].get("scale_loss", 1.0)
use_dynamic_loss_scaling = config["AMP"].get("use_dynamic_loss_scaling", use_dynamic_loss_scaling = config["Global"].get(
False) "use_dynamic_loss_scaling", False)
scaler = paddle.amp.GradScaler( scaler = paddle.amp.GradScaler(
init_loss_scaling=scale_loss, init_loss_scaling=scale_loss,
use_dynamic_loss_scaling=use_dynamic_loss_scaling) use_dynamic_loss_scaling=use_dynamic_loss_scaling)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册