提交 e8ced77f 编写于 作者: H HydrogenSulfate

add train_fleet_infer chain

上级 787f91b6
...@@ -112,3 +112,4 @@ bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/MobileNetV3/Mo ...@@ -112,3 +112,4 @@ bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/MobileNetV3/Mo
- [test_lite_arm_cpu_cpp 使用](docs/test_lite_arm_cpu_cpp.md): 测试基于Paddle-Lite的ARM CPU端c++预测部署功能. - [test_lite_arm_cpu_cpp 使用](docs/test_lite_arm_cpu_cpp.md): 测试基于Paddle-Lite的ARM CPU端c++预测部署功能.
- [test_paddle2onnx 使用](docs/test_paddle2onnx.md):测试Paddle2ONNX的模型转化功能,并验证正确性。 - [test_paddle2onnx 使用](docs/test_paddle2onnx.md):测试Paddle2ONNX的模型转化功能,并验证正确性。
- [test_serving_infer_python 使用](docs/test_serving_infer_python.md):测试python serving功能。 - [test_serving_infer_python 使用](docs/test_serving_infer_python.md):测试python serving功能。
- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md):测试基于Python的多机多卡训练与推理等基本功能。
===========================train_params===========================
model_name:GeneralRecognition_PPLCNet_x2_5
python:python3.7
gpu_list:192.168.0.1,192.168.0.2;0,1
-o Global.device:gpu
-o Global.auto_cast:null
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
-o Global.output_dir:./output/
-o DataLoader.Train.sampler.batch_size:8
-o Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./dataset/ILSVRC2012/val
null:null
##
trainer:norm_train
norm_train:tools/train.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml
null:null
##
===========================infer_params==========================
-o Global.save_inference_dir:./inference
-o Global.pretrained_model:
norm_export:tools/export_model.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml
quant_export:null
fpgm_export:null
distill_export:null
kl_quant:null
export2:null
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/general_PPLCNet_x2_5_pretrained_v1.0.pdparams
infer_model:../inference/
infer_export:True
infer_quant:Fasle
inference:python/predict_rec.py -c configs/inference_rec.yaml
-o Global.use_gpu:True|False
-o Global.enable_mkldnn:True|False
-o Global.cpu_num_threads:1|6
-o Global.batch_size:1|16
-o Global.use_tensorrt:True|False
-o Global.use_fp16:True|False
-o Global.rec_inference_model_dir:../inference
-o Global.infer_imgs:../dataset/Aliproduct/demo_test/
-o Global.save_log_path:null
-o Global.benchmark:True
null:null
null:null
===========================infer_benchmark_params==========================
random_infer_input:[{float32,[3,224,224]}]
\ No newline at end of file
===========================train_params===========================
model_name:PPHGNet_small
python:python3.7
gpu_list:192.168.0.1,192.168.0.2;0,1
-o Global.device:gpu
-o Global.auto_cast:null
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
-o Global.output_dir:./output/
-o DataLoader.Train.sampler.batch_size:8
-o Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./dataset/ILSVRC2012/val
null:null
##
trainer:norm_train
norm_train:tools/train.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
null:null
##
===========================infer_params==========================
-o Global.save_inference_dir:./inference
-o Global.pretrained_model:
norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
quant_export:null
fpgm_export:null
distill_export:null
kl_quant:null
export2:null
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_small_pretrained.pdparams
infer_model:../inference/
infer_export:True
infer_quant:Fasle
inference:python/predict_cls.py -c configs/inference_cls.yaml -o PreProcess.transform_ops.0.ResizeImage.resize_short=236
-o Global.use_gpu:True|False
-o Global.enable_mkldnn:True|False
-o Global.cpu_num_threads:1|6
-o Global.batch_size:1|16
-o Global.use_tensorrt:True|False
-o Global.use_fp16:True|False
-o Global.inference_model_dir:../inference
-o Global.infer_imgs:../dataset/ILSVRC2012/val
-o Global.save_log_path:null
-o Global.benchmark:True
null:null
===========================infer_benchmark_params==========================
random_infer_input:[{float32,[3,224,224]}]
===========================train_params===========================
model_name:PPLCNet_x1_0
python:python3.7
gpu_list:192.168.0.1,192.168.0.2;0,1
-o Global.device:gpu
-o Global.auto_cast:null
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
-o Global.output_dir:./output/
-o DataLoader.Train.sampler.batch_size:8
-o Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./dataset/ILSVRC2012/val
null:null
##
trainer:norm_train
norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml
null:null
##
===========================infer_params==========================
-o Global.save_inference_dir:./inference
-o Global.pretrained_model:
norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml
quant_export:null
fpgm_export:null
distill_export:null
kl_quant:null
export2:null
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams
infer_model:../inference/
infer_export:True
infer_quant:Fasle
inference:python/predict_cls.py -c configs/inference_cls.yaml
-o Global.use_gpu:True|False
-o Global.enable_mkldnn:True|False
-o Global.cpu_num_threads:1|6
-o Global.batch_size:1|16
-o Global.use_tensorrt:True|False
-o Global.use_fp16:True|False
-o Global.inference_model_dir:../inference
-o Global.infer_imgs:../dataset/ILSVRC2012/val
-o Global.save_log_path:null
-o Global.benchmark:True
null:null
===========================infer_benchmark_params==========================
random_infer_input:[{float32,[3,224,224]}]
\ No newline at end of file
===========================train_params===========================
model_name:PPLCNetV2_base
python:python3.7
gpu_list:192.168.0.1,192.168.0.2;0,1
-o Global.device:gpu
-o Global.auto_cast:null
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
-o Global.output_dir:./output/
-o DataLoader.Train.sampler.first_bs:8
-o Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./dataset/ILSVRC2012/val
null:null
##
trainer:norm_train
norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml -o Global.seed=1234 -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml
null:null
##
===========================infer_params==========================
-o Global.save_inference_dir:./inference
-o Global.pretrained_model:
norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml
quant_export:null
fpgm_export:null
distill_export:null
kl_quant:null
export2:null
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_pretrained.pdparams
infer_model:../inference/
infer_export:True
infer_quant:Fasle
inference:python/predict_cls.py -c configs/inference_cls.yaml
-o Global.use_gpu:True|False
-o Global.enable_mkldnn:True|False
-o Global.cpu_num_threads:1|6
-o Global.batch_size:1|16
-o Global.use_tensorrt:True|False
-o Global.use_fp16:True|False
-o Global.inference_model_dir:../inference
-o Global.infer_imgs:../dataset/ILSVRC2012/val
-o Global.save_log_path:null
-o Global.benchmark:True
null:null
===========================infer_benchmark_params==========================
random_infer_input:[{float32,[3,224,224]}]
# Linux GPU/CPU 多机多卡训练推理测试
Linux GPU/CPU 多机多卡训练推理测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的多机多卡模型训练、评估、推理等基本功能。
## 1. 测试结论汇总
- 训练相关:
| 算法名称 | 模型名称 | 多机多卡 |
| :-------: | :-----------------: | :--------: |
| PPLCNet | PPLCNet_x1_0 | 分布式训练 |
| PPLCNetV2 | PPLCNetV2_base | 分布式训练 |
| PPHGNet | PPHGNet_small | 分布式训练 |
| PP-ShiTu | PPShiTu_general_rec | 分布式训练 |
- 推理相关:
| 算法名称 | 模型名称 | device_CPU | device_GPU | batchsize |
| :-------: | :-----------------: | :--------: | :--------: | :-------: |
| PPLCNet | PPLCNet_x1_0 | 支持 | 支持 | 1 |
| PPLCNetV2 | PPLCNetV2_base | 支持 | 支持 | 1 |
| PPHGNet | PPHGNet_small | 支持 | 支持 | 1 |
| PP-ShiTu | PPShiTu_general_rec | 支持 | 支持 | 1 |
## 2. 测试流程
运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。
**下面以 PPLCNet_x1_0 模型为例,介绍测试流程**
### 2.1 功能测试
#### 2.1.1 修改配置文件
首先,修改配置文件`test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt`中的`gpu_list`设置:假设两台机器的`ip`地址分别为`192.168.0.1``192.168.0.2`,则对应的配置文件`gpu_list`字段需要修改为`gpu_list:192.168.0.1,192.168.0.2;0,1`
**`ip`地址查看命令为`ifconfig`,在`inet addr:`字段后的即为ip地址**
#### 2.1.2 准备数据
运行`prepare.sh`准备数据和模型,数据准备命令如下所示。
```shell
bash test_tipc/prepare.sh test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt lite_train_lite_infer
```
**注意:** 由于是多机训练,这里需要在所有节点上都运行一次启动上述命令来准备数据。
#### 2.1.3 修改起始端口开始测试
在多机的节点上使用下面的命令设置分布式的起始端口(否则后面运行的时候会由于无法找到运行端口而hang住),一般建议设置在`10000~20000`之间。
```shell
export FLAGS_START_PORT=17000
```
**注意:** 上述修改起始端口命令同样需要在所有节点上都执行一次。
接下来就可以开始执行测试,命令如下所示。
```shell
bash test_tipc/test_train_inference_python.sh test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt
```
**注意:** 由于是多机训练,这里需要在所有的节点上均运行启动上述命令进行测试。
#### 2.1.4 输出结果
输出结果保存在`test_tipc/output/PPLCNet_x1_0/results_python.log`,内容如下,以`Run successfully`开头表示测试命令正常,否则为测试失败。
```bash
Run successfully with command - python3.7 -m paddle.distributed.launch --ips=192.168.0.1,192.168.0.2 --gpus=0,1 tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataL
oader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.device=gpu -o Global.output_dir=./test_tipc/output/PPLCNet_x1_0/norm_train_gpus_0,
1_autocast_null_nodes_2 -o Global.epochs=2 -o DataLoader.Train.sampler.batch_size=8 !
...
...
Run successfully with command - python3.7 python/predict_cls.py -c configs/inference_cls.yaml -o Global.use_gpu=False -o Global.enable_mkldnn=True -o Global.cpu_num_threads=1 -o Global.inference_model_dir=.././t
est_tipc/output/PPLCNet_x1_0/norm_train_gpus_0,1_autocast_null_nodes_2 -o Global.batch_size=16 -o Global.infer_imgs=../dataset/ILSVRC2012/val -o Global.benchmark=True > .././test_tipc/output/PPLCNet_x1_0/infer_cpu_us
emkldnn_True_threads_1_batchsize_16.log 2>&1 !
```
在配置文件中默认设置`-o Global.benchmark:True`表示开启benchmark选项,此时可以得到测试的详细数据,包含运行环境信息(系统版本、CUDA版本、CUDNN版本、驱动版本),Paddle版本信息,参数设置信息(运行设备、线程数、是否开启内存优化等),模型信息(模型名称、精度),数据信息(batchsize、是否为动态shape等),性能信息(CPU,GPU的占用、运行耗时、预处理耗时、推理耗时、后处理耗时),内容如下所示:
```log
[2022/06/07 17:01:41] root INFO: ---------------------- Env info ----------------------
[2022/06/07 17:01:41] root INFO: OS_version: CentOS 6.10
[2022/06/07 17:01:41] root INFO: CUDA_version: 10.1.243
[2022/06/07 17:01:41] root INFO: CUDNN_version: None.None.None
[2022/06/07 17:01:41] root INFO: drivier_version: 460.32.03
[2022/06/07 17:01:41] root INFO: ---------------------- Paddle info ----------------------
[2022/06/07 17:01:41] root INFO: paddle_version: 2.3.0-rc0
[2022/06/07 17:01:41] root INFO: paddle_version: 2.3.0-rc0
[2022/06/07 17:01:41] root INFO: paddle_commit: 5d4980c052583fec022812d9c29460aff7cdc18b
[2022/06/07 17:01:41] root INFO: log_api_version: 1.0
[2022/06/07 17:01:41] root INFO: ----------------------- Conf info -----------------------
[2022/06/07 17:01:41] root INFO: runtime_device: cpu
[2022/06/07 17:01:41] root INFO: ir_optim: True
[2022/06/07 17:01:41] root INFO: enable_memory_optim: True
[2022/06/07 17:01:41] root INFO: enable_tensorrt: False
[2022/06/07 17:01:41] root INFO: enable_mkldnn: False
[2022/06/07 17:01:41] root INFO: cpu_math_library_num_threads: 6
[2022/06/07 17:01:41] root INFO: ----------------------- Model info ----------------------
[2022/06/07 17:01:41] root INFO: model_name: cls
[2022/06/07 17:01:41] root INFO: precision: fp32
[2022/06/07 17:01:41] root INFO: ----------------------- Data info -----------------------
[2022/06/07 17:01:41] root INFO: batch_size: 16
[2022/06/07 17:01:41] root INFO: input_shape: [3, 224, 224]
[2022/06/07 17:01:41] root INFO: data_num: 3
[2022/06/07 17:01:41] root INFO: ----------------------- Perf info -----------------------
[2022/06/07 17:01:41] root INFO: cpu_rss(MB): 726.5586, gpu_rss(MB): None, gpu_util: None%
[2022/06/07 17:01:41] root INFO: total time spent(s): 0.3527
[2022/06/07 17:01:41] root INFO: preprocess_time(ms): 33.2723, inference_time(ms): 317.9824, postprocess_time(ms): 1.4579
```
该信息可以在运行log中查看,log位置在`test_tipc/output/PPLCNet_x1_0/infer_gpu_usetrt_True_precision_True_batchsize_1.log`
如果运行失败,也会在终端中输出运行失败的日志信息以及对应的运行命令。可以基于该命令,分析运行失败的原因。
**注意:** 由于分布式训练时,仅在`trainer_id=0`所在的节点中保存模型,因此其他的节点中在运行模型导出与推理时会因为找不到保存的模型而报错,为正常现象。
...@@ -43,7 +43,7 @@ function func_get_url_file_name() { ...@@ -43,7 +43,7 @@ function func_get_url_file_name() {
model_name=$(func_parser_value "${lines[1]}") model_name=$(func_parser_value "${lines[1]}")
if [ ${MODE} = "cpp_infer" ]; then if [[ ${MODE} = "cpp_infer" ]]; then
if [ -d "./deploy/cpp/opencv-3.4.7/opencv3/" ] && [ $(md5sum ./deploy/cpp/opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ]; then if [ -d "./deploy/cpp/opencv-3.4.7/opencv3/" ] && [ $(md5sum ./deploy/cpp/opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ]; then
echo "################### build opencv skipped ###################" echo "################### build opencv skipped ###################"
else else
...@@ -151,7 +151,7 @@ if [[ $FILENAME == *use_dali* ]]; then ...@@ -151,7 +151,7 @@ if [[ $FILENAME == *use_dali* ]]; then
${python_name} -m pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda102 ${python_name} -m pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda102
fi fi
if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer" ]; then if [[ ${MODE} = "lite_train_lite_infer" ]] || [[ ${MODE} = "lite_train_whole_infer" ]]; then
# pretrain lite train data # pretrain lite train data
cd dataset cd dataset
rm -rf ILSVRC2012 rm -rf ILSVRC2012
...@@ -163,7 +163,7 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer" ...@@ -163,7 +163,7 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer"
mv val.txt val_list.txt mv val.txt val_list.txt
cp -r train/* val/ cp -r train/* val/
cd ../../ cd ../../
elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then elif [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then
# download data # download data
cd dataset cd dataset
rm -rf ILSVRC2012 rm -rf ILSVRC2012
...@@ -185,7 +185,7 @@ elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then ...@@ -185,7 +185,7 @@ elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
eval $cmd eval $cmd
fi fi
elif [ ${MODE} = "whole_train_whole_infer" ]; then elif [[ ${MODE} = "whole_train_whole_infer" ]]; then
cd dataset cd dataset
rm -rf ILSVRC2012 rm -rf ILSVRC2012
wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_CIFAR100.tar wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_CIFAR100.tar
...@@ -197,7 +197,7 @@ elif [ ${MODE} = "whole_train_whole_infer" ]; then ...@@ -197,7 +197,7 @@ elif [ ${MODE} = "whole_train_whole_infer" ]; then
cd ../../ cd ../../
fi fi
if [ ${MODE} = "serving_infer" ]; then if [[ ${MODE} = "serving_infer" ]]; then
# prepare serving env # prepare serving env
python_name=$(func_parser_value "${lines[2]}") python_name=$(func_parser_value "${lines[2]}")
${python_name} -m pip install install paddle-serving-server-gpu==0.7.0.post102 ${python_name} -m pip install install paddle-serving-server-gpu==0.7.0.post102
...@@ -225,7 +225,7 @@ if [ ${MODE} = "serving_infer" ]; then ...@@ -225,7 +225,7 @@ if [ ${MODE} = "serving_infer" ]; then
unset https_proxy unset https_proxy
fi fi
if [ ${MODE} = "paddle2onnx_infer" ]; then if [[ ${MODE} = "paddle2onnx_infer" ]]; then
# prepare paddle2onnx env # prepare paddle2onnx env
python_name=$(func_parser_value "${lines[2]}") python_name=$(func_parser_value "${lines[2]}")
inference_model_url=$(func_parser_value "${lines[10]}") inference_model_url=$(func_parser_value "${lines[10]}")
...@@ -241,7 +241,7 @@ if [ ${MODE} = "paddle2onnx_infer" ]; then ...@@ -241,7 +241,7 @@ if [ ${MODE} = "paddle2onnx_infer" ]; then
cd ../../ cd ../../
fi fi
if [ ${MODE} = "benchmark_train" ]; then if [[ ${MODE} = "benchmark_train" ]]; then
pip install -r requirements.txt pip install -r requirements.txt
cd dataset cd dataset
rm -rf ILSVRC2012 rm -rf ILSVRC2012
......
...@@ -90,7 +90,7 @@ infer_value1=$(func_parser_value "${lines[50]}") ...@@ -90,7 +90,7 @@ infer_value1=$(func_parser_value "${lines[50]}")
if [ ! $epoch_num ]; then if [ ! $epoch_num ]; then
epoch_num=2 epoch_num=2
fi fi
if [ $MODE = 'benchmark_train' ]; then if [[ $MODE = 'benchmark_train' ]]; then
epoch_num=1 epoch_num=1
fi fi
...@@ -161,7 +161,7 @@ function func_inference(){ ...@@ -161,7 +161,7 @@ function func_inference(){
done done
} }
if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then if [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then
IFS="|" IFS="|"
infer_export_flag=(${infer_export_flag}) infer_export_flag=(${infer_export_flag})
if [ ${infer_export_flag} != "null" ] && [ ${infer_export_flag} != "False" ]; then if [ ${infer_export_flag} != "null" ] && [ ${infer_export_flag} != "False" ]; then
...@@ -171,7 +171,7 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then ...@@ -171,7 +171,7 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
fi fi
fi fi
if [ ${MODE} = "whole_infer" ]; then if [[ ${MODE} = "whole_infer" ]]; then
GPUID=$3 GPUID=$3
if [ ${#GPUID} -le 0 ];then if [ ${#GPUID} -le 0 ];then
env=" " env=" "
...@@ -191,7 +191,7 @@ if [ ${MODE} = "whole_infer" ]; then ...@@ -191,7 +191,7 @@ if [ ${MODE} = "whole_infer" ]; then
done done
cd .. cd ..
elif [ ${MODE} = "klquant_whole_infer" ]; then elif [[ ${MODE} = "klquant_whole_infer" ]]; then
# for kl_quant # for kl_quant
if [ ${kl_quant_cmd_value} != "null" ] && [ ${kl_quant_cmd_value} != "False" ]; then if [ ${kl_quant_cmd_value} != "null" ] && [ ${kl_quant_cmd_value} != "False" ]; then
echo "kl_quant" echo "kl_quant"
...@@ -270,7 +270,9 @@ else ...@@ -270,7 +270,9 @@ else
set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu_value}") set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu_value}")
if [ ${#ips} -le 26 ];then if [ ${#ips} -le 15 ];then
# if length of ips >= 15, then it is seen as multi-machine
# 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0
save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
nodes=1 nodes=1
else else
...@@ -289,7 +291,7 @@ else ...@@ -289,7 +291,7 @@ else
set_save_model=$(func_set_params "${save_model_key}" "${save_log}") set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} " cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} "
elif [ ${#ips} -le 26 ];then # train with multi-gpu elif [ ${#ips} -le 15 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}" cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}"
else # train with multi-machine else # train with multi-machine
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册