diff --git a/test_tipc/supplementary/readme.md b/test_tipc/supplementary/readme.md index b630b0f30b23b71c0dd21def2a45fee01023fe82..a378fc5f357d0deb54d5a2de93d8ca6de034fa24 100644 --- a/test_tipc/supplementary/readme.md +++ b/test_tipc/supplementary/readme.md @@ -47,6 +47,13 @@ bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT.txt 'lit bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_FPGM.txt 'lite_train_lite_infer' ``` +多机多卡的运行配置文件分别为 `train_infer_python_fleet.txt`, `train_infer_python_FPGM_fleet.txt` 和 `train_infer_python_PACT_fleet.txt`。 +运行时,需要修改配置文件中的 `gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1`。将 `xx.xx.xx.xx` 和 `yy.yy.yy.yy` 替换为各节点的具体 `ip` 地址,各个 `ip` 地址之间用 `,` 分隔。另外,和单机训练 +不同,启动多机多卡训练需要在多机的每个节点上分别运行命令。以多机多卡量化训练为例,指令如下: +``` +bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT_fleet.txt 'lite_train_lite_infer' +``` + 运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如'lite_train_lite_infer'模式运行后,在test_tipc/extra_output文件夹有以下文件: ``` diff --git a/test_tipc/supplementary/test_tipc/test_train_python.sh b/test_tipc/supplementary/test_tipc/test_train_python.sh index f922b57bba7de97d3631524c6f1bd1fac7395e76..ed709c1c4be886d8101e50108ad02714874ea14f 100644 --- a/test_tipc/supplementary/test_tipc/test_train_python.sh +++ b/test_tipc/supplementary/test_tipc/test_train_python.sh @@ -35,7 +35,6 @@ use_share_conv_key=$(func_parser_key "${lines[13]}") use_share_conv_list=$(func_parser_value "${lines[13]}") run_train_py=$(func_parser_value "${lines[14]}") - LOG_PATH="./test_tipc/extra_output" mkdir -p ${LOG_PATH} status_log="${LOG_PATH}/results_python.log" @@ -98,6 +97,8 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "whole_train_whole_infer cmd="${python} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}" elif [ ${#ips} -le 26 ];then # train with multi-gpu cmd="${python} -m 
paddle.distributed.launch --gpus=${gpu} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}" + else + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}" fi # run train diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt index 4c2e28b91e24b34d1bded93cddebe83e0874ae29..ccbd27ffbcb11a0b70f480738186fadf6fc09ded 100644 --- a/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt +++ b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt @@ -4,9 +4,9 @@ python:python3.7 gpu_list:0|0,1 use_gpu:True|True AMP.use_amp:True|False -epoch:lite_train_lite_infer=20|whole_train_whole_infer=1000 +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 save_model_dir:./output/ -TRAIN.batch_size:lite_train_lite_infer=2|whole_train_whole_infer=4 +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 pretrained_model:null checkpoints:null use_custom_relu:False|True diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_FPGM_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM_fleet.txt new file mode 100644 index 0000000000000000000000000000000000000000..be2b2117d732816bb4f2f037e27a866eb8e58f19 --- /dev/null +++ b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM_fleet.txt @@ -0,0 +1,17 @@ +===========================train_params=========================== +model_name:ch_PPOCRv2_det +python:python3.7 +gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1 +use_gpu:True +AMP.use_amp:True|False +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 
+save_model_dir:./output/ +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 +pretrained_model:null +checkpoints:null +use_custom_relu:False|True +model_type:cls|cls_distill|cls_distill_multiopt +MODEL.siamese:False|True +norm_train:train.py -c mv3_large_x0_5.yml -o prune_train=True +quant_train:False +prune_train:False diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt b/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt index 079cddf878712b2ba3af3a19f97be3bb5a0896da..24d291b4b3b49ab90fcb2eb3fd2b5ae2ece226e9 100644 --- a/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt +++ b/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt @@ -4,9 +4,9 @@ python:python3.7 gpu_list:0|0,1 use_gpu:True|True AMP.use_amp:True|False -epoch:lite_train_lite_infer=20|whole_train_whole_infer=1000 +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 save_model_dir:./output/ -TRAIN.batch_size:lite_train_lite_infer=2|whole_train_whole_infer=4 +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 pretrained_model:null checkpoints:null use_custom_relu:False|True diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_PACT_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_PACT_fleet.txt new file mode 100644 index 0000000000000000000000000000000000000000..93f06d76336efd1ea7fb94fbb0263e569760086f --- /dev/null +++ b/test_tipc/supplementary/test_tipc/train_infer_python_PACT_fleet.txt @@ -0,0 +1,17 @@ +===========================train_params=========================== +model_name:ch_PPOCRv2_det +python:python3.7 +gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1 +use_gpu:True +AMP.use_amp:True|False +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 +save_model_dir:./output/ +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 +pretrained_model:null +checkpoints:null +use_custom_relu:False|True +model_type:cls|cls_distill|cls_distill_multiopt 
+MODEL.siamese:False|True +norm_train:train.py -c mv3_large_x0_5.yml -o quant_train=True +quant_train:False +prune_train:False diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt new file mode 100644 index 0000000000000000000000000000000000000000..00b9e8234bc5140188077f0b447d706603f612b7 --- /dev/null +++ b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt @@ -0,0 +1,17 @@ +===========================train_params=========================== +model_name:ch_PPOCRv2_det +python:python3.7 +gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1 +use_gpu:True +AMP.use_amp:True|False +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 +save_model_dir:./output/ +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 +pretrained_model:null +checkpoints:null +use_custom_relu:False|True +model_type:cls|cls_distill|cls_distill_multiopt +MODEL.siamese:False|True +norm_train: train.py -c mv3_large_x0_5.yml -o +quant_train:False +prune_train:False