From 5c72d5a12b295d9242faaf37421e1b1257a00bc0 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Tue, 7 Jun 2022 14:44:29 +0800 Subject: [PATCH] [TIPC] add dist train infer (#6141) --- test_tipc/README.md | 1 + ...n_s_300e_coco_train_fleet_infer_python.txt | 53 +++++++++++++ .../docs/test_train_fleet_inference_python.md | 76 +++++++++++++++++++ test_tipc/prepare.sh | 22 +++--- test_tipc/test_train_inference_python.sh | 10 ++- 5 files changed, 149 insertions(+), 13 deletions(-) create mode 100644 test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt create mode 100644 test_tipc/docs/test_train_fleet_inference_python.md diff --git a/test_tipc/README.md b/test_tipc/README.md index 097b61cd3..8df5e2230 100644 --- a/test_tipc/README.md +++ b/test_tipc/README.md @@ -105,6 +105,7 @@ bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/yolov3/yolov3_ ## 4. 开始测试 各功能测试中涉及混合精度、裁剪、量化等训练相关,及mkldnn、Tensorrt等多种预测相关参数配置,请点击下方相应链接了解更多细节和使用教程: - [test_train_inference_python 使用](docs/test_train_inference_python.md) :测试基于Python的模型训练、评估、推理等基本功能,包括裁剪、量化、蒸馏。 +- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md):测试基于Python的多机多卡训练与推理等基本功能。 - [test_inference_cpp 使用](docs/test_inference_cpp.md):测试基于C++的模型推理。 - [test_serving 使用](./):测试基于Paddle Serving的服务化部署功能。 - [test_lite_arm_cpu_cpp 使用](./):测试基于Paddle-Lite的ARM CPU端c++预测部署功能。 diff --git a/test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt b/test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt new file mode 100644 index 000000000..775a96fb1 --- /dev/null +++ b/test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:ppyoloe_crn_s_300e_coco +python:python3.7 +gpu_list:192.168.0.1,192.168.0.2;0,1 +use_gpu:True +auto_cast:null 
+epoch:lite_train_lite_infer=1|lite_train_whole_infer=1|whole_train_whole_infer=300 +save_dir:null +TrainReader.batch_size:lite_train_lite_infer=2|lite_train_whole_infer=2|whole_train_whole_infer=2 +pretrain_weights:https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_300e_coco.pdparams +trained_model_name:model_final.pdparams +train_infer_img_dir:./dataset/coco/test2017/ +filename:null +## +trainer:norm_train +norm_train:tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml -o +pact_train:tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml --slim_config _template_pact -o +fpgm_train:tools/train.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml --slim_config _template_fpgm -o +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml -o +null:null +## +===========================infer_params=========================== +--output_dir:./output_inference +weights:https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_300e_coco.pdparams +norm_export:tools/export_model.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml -o +pact_export:tools/export_model.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml --slim_config _template_pact -o +fpgm_export:tools/export_model.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml --slim_config _template_fpgm -o +distill_export:null +export1:null +export2:null +kl_quant_export:tools/post_quant.py -c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml --slim_config _template_kl_quant -o +## +infer_mode:norm +infer_quant:False +inference:./deploy/python/infer.py +--device:gpu|cpu +--enable_mkldnn:False +--cpu_threads:4 +--batch_size:1|2 +--use_tensorrt:null +--run_mode:paddle +--model_dir: +--image_dir:./dataset/coco/test2017/ +--save_log_path:null +--run_benchmark:False +--trt_max_shape:1600 +===========================infer_benchmark_params=========================== +numpy_infer_input:3x640x640.npy \ No newline 
at end of file diff --git a/test_tipc/docs/test_train_fleet_inference_python.md b/test_tipc/docs/test_train_fleet_inference_python.md new file mode 100644 index 000000000..0b9bb6580 --- /dev/null +++ b/test_tipc/docs/test_train_fleet_inference_python.md @@ -0,0 +1,76 @@ +# Linux GPU/CPU 多机多卡训练推理测试 + +Linux GPU/CPU 多机多卡训练推理测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能。 + +## 1. 测试结论汇总 + +- 训练相关: + +| 算法名称 | 模型名称 | 多机多卡 | +|:--------:| :----: | :----: | +| PP-YOLOE | ppyoloe_crn_s_300e_coco | 分布式训练 | + + +- 推理相关: + +| 算法名称 | 模型名称 | device_CPU | device_GPU | batchsize | +|:--------:|:------------------------:| :----: | :----: |:---------:| +| PP-YOLOE | ppyoloe_crn_s_300e_coco | 支持 | 支持 | 1, 2 | + + +## 2. 测试流程 + +运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 + +### 2.1 功能测试 + +#### 2.1.1 修改配置文件 + +首先,修改配置文件中的`ip`设置: 假设两台机器的`ip`地址分别为`192.168.0.1`和`192.168.0.2`,则对应的配置文件`gpu_list`字段需要修改为`gpu_list:192.168.0.1,192.168.0.2;0,1`; `ip`地址查看命令为`ifconfig`。 + + +#### 2.1.2 准备数据 + +运行`prepare.sh`准备数据和模型,以配置文件`test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt`为例,数据准备命令如下所示。 + +```shell +bash test_tipc/prepare.sh test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt lite_train_lite_infer +``` + +**注意:** 由于是多机训练,这里需要在所有的节点上均运行启动上述命令,准备数据。 + +#### 2.1.3 修改起始端口并开始测试 + +在多机的节点上使用下面的命令设置分布式的起始端口(否则后面运行的时候会由于无法找到运行端口而hang住),一般建议设置在`10000~20000`之间。 + +```shell +export FLAGS_START_PORT=17000 +``` + +以配置文件`test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt`为例,测试方法如下所示。 + +```shell +bash test_tipc/test_train_inference_python.sh test_tipc/configs/ppyoloe/ppyoloe_crn_s_300e_coco_train_fleet_infer_python.txt lite_train_lite_infer +``` + +**注意:** 由于是多机训练,这里需要在所有的节点上均运行启动上述命令进行测试。 + + +#### 2.1.4 输出结果 + +输出结果如下,表示命令运行成功。 + +```bash + Run successfully with command - python3.7 -m paddle.distributed.launch --ips=192.168.0.1,192.168.0.2 --gpus=0,1 + tools/train.py -c 
configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml -o log_iter=1 use_gpu=True save_dir=./test_tipc/outpu +t/ppyoloe_crn_s_300e_coco/norm_train_gpus_0,1_autocast_null_nodes_2 epoch=1 pretrain_weights=https://paddledet.bj.bc +ebos.com/models/ppyoloe_crn_s_300e_coco.pdparams TrainReader.batch_size=2 filename=ppyoloe_crn_s_300e_coco ! + + ...... + Run successfully with command - python3.7 ./deploy/python/infer.py --device=cpu --enable_mkldnn=False --cpu_threads +=4 --model_dir=./test_tipc/output/ppyoloe_crn_s_300e_coco/norm_train_gpus_0,1_autocast_null_nodes_2/ppyoloe_crn_s_30 +0e_coco --batch_size=2 --image_dir=./dataset/coco/test2017/ --run_benchmark=False --trt_max_shape=1600 > ./test_tipc +/output/ppyoloe_crn_s_300e_coco/python_infer_cpu_usemkldnn_False_threads_4_precision_fluid_batchsize_2.log 2>&1 ! +``` + +**注意:** 由于分布式训练时,仅在`trainer_id=0`所在的节点中保存模型,因此其他的节点中在运行模型导出与推理时会报错,为正常现象。 diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index 37efb0306..ca854ed71 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -22,15 +22,15 @@ if [ ${MODE} = "whole_train_whole_infer" ];then eval "${python} ./dataset/coco/download_coco.py" elif [ ${MODE} = "cpp_infer" ];then # download coco lite data - wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/tipc/coco_tipc.tar + wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/tipc/coco_tipc.tar --no-check-certificate cd ./dataset/coco/ && tar -xvf coco_tipc.tar && mv -n coco_tipc/* . rm -rf coco_tipc/ && cd ../../ # download wider_face lite data - wget -nc -P ./dataset/wider_face/ https://paddledet.bj.bcebos.com/data/tipc/wider_tipc.tar + wget -nc -P ./dataset/wider_face/ https://paddledet.bj.bcebos.com/data/tipc/wider_tipc.tar --no-check-certificate cd ./dataset/wider_face/ && tar -xvf wider_tipc.tar && mv -n wider_tipc/* . 
rm -rf wider_tipc/ && cd ../../ # download spine lite data - wget -nc -P ./dataset/spine_coco/ https://paddledet.bj.bcebos.com/data/tipc/spine_tipc.tar + wget -nc -P ./dataset/spine_coco/ https://paddledet.bj.bcebos.com/data/tipc/spine_tipc.tar --no-check-certificate cd ./dataset/spine_coco/ && tar -xvf spine_tipc.tar && mv -n spine_tipc/* . rm -rf spine_tipc/ && cd ../../ if [[ ${model_name} =~ "s2anet" ]]; then @@ -38,7 +38,7 @@ elif [ ${MODE} = "cpp_infer" ];then cd ../../ fi # download mot lite data - wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/tipc/mot_tipc.tar + wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/tipc/mot_tipc.tar --no-check-certificate cd ./dataset/mot/ && tar -xvf mot_tipc.tar && mv -n mot_tipc/* . rm -rf mot_tipc/ && cd ../../ @@ -50,7 +50,7 @@ elif [ ${MODE} = "cpp_infer" ];then echo "################### Opencv already exists, skip downloading. ###################" else mkdir -p $(pwd)/deps && cd $(pwd)/deps - wget -c https://paddledet.bj.bcebos.com/data/opencv-3.4.16_gcc8.2_ffmpeg.tar.gz + wget -c https://paddledet.bj.bcebos.com/data/opencv-3.4.16_gcc8.2_ffmpeg.tar.gz --no-check-certificate tar -xvf opencv-3.4.16_gcc8.2_ffmpeg.tar.gz && cd ../ echo "################### Finish downloading opencv. 
###################" fi @@ -60,13 +60,13 @@ elif [ ${MODE} = "benchmark_train" ];then pip install -U pip Cython pip install -r requirements.txt # prepare lite benchmark coco data - wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar + wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar --no-check-certificate cd ./dataset/coco/ && tar -xvf coco_benchmark.tar mv -u coco_benchmark/* ./ ls ./ cd ../../ # prepare lite benchmark mot data - wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/mot_benchmark.tar + wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/mot_benchmark.tar --no-check-certificate cd ./dataset/mot/ && tar -xvf mot_benchmark.tar mv -u mot_benchmark/* ./ ls ./ @@ -87,15 +87,15 @@ elif [ ${MODE} = "serving_infer" ];then python -m pip install paddlepaddle-gpu==2.2.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html else # download coco lite data - wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/tipc/coco_tipc.tar + wget -nc -P ./dataset/coco/ https://paddledet.bj.bcebos.com/data/tipc/coco_tipc.tar --no-check-certificate cd ./dataset/coco/ && tar -xvf coco_tipc.tar && mv -n coco_tipc/* . rm -rf coco_tipc/ && cd ../../ # download wider_face lite data - wget -nc -P ./dataset/wider_face/ https://paddledet.bj.bcebos.com/data/tipc/wider_tipc.tar + wget -nc -P ./dataset/wider_face/ https://paddledet.bj.bcebos.com/data/tipc/wider_tipc.tar --no-check-certificate cd ./dataset/wider_face/ && tar -xvf wider_tipc.tar && mv -n wider_tipc/* . rm -rf wider_tipc/ && cd ../../ # download spine_coco lite data - wget -nc -P ./dataset/spine_coco/ https://paddledet.bj.bcebos.com/data/tipc/spine_tipc.tar + wget -nc -P ./dataset/spine_coco/ https://paddledet.bj.bcebos.com/data/tipc/spine_tipc.tar --no-check-certificate cd ./dataset/spine_coco/ && tar -xvf spine_tipc.tar && mv -n spine_tipc/* . 
rm -rf spine_tipc/ && cd ../../ if [[ ${model_name} =~ "s2anet" ]]; then @@ -103,7 +103,7 @@ else cd ../../ fi # download mot lite data - wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/tipc/mot_tipc.tar + wget -nc -P ./dataset/mot/ https://paddledet.bj.bcebos.com/data/tipc/mot_tipc.tar --no-check-certificate cd ./dataset/mot/ && tar -xvf mot_tipc.tar && mv -n mot_tipc/* . rm -rf mot_tipc/ && cd ../../ fi diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index d0eb77810..91b32271e 100644 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -278,10 +278,16 @@ else set_save_model=$(func_set_params "${save_model_key}" "${save_log}") if [ ${#gpu} -le 2 ];then # train with cpu or single gpu cmd="${python} ${run_train} LearningRate.base_lr=0.0001 log_iter=1 ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_filename} ${set_train_params1} ${set_autocast}" - elif [ ${#ips} -le 26 ];then # train with multi-gpu + elif [ ${#ips} -le 15 ];then # train with multi-gpu cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} log_iter=1 ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_filename} ${set_train_params1} ${set_autocast}" else # train with multi-machine - cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${set_use_gpu} ${run_train} log_iter=1 ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_filename} ${set_train_params1} ${set_autocast}" + IFS="," + ips_array=(${ips}) + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + IFS="|" + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} log_iter=1 ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_filename} 
${set_train_params1} ${set_autocast}" fi # run train eval $cmd -- GitLab