From 7b60e7d892247c8bca9d2d12b5e2d8e3bec982f8 Mon Sep 17 00:00:00 2001
From: shangliang Xu
Date: Fri, 18 Feb 2022 20:32:43 +0800
Subject: [PATCH] [TIPC] add benchmark for yolov3, mask_rcnn (#5224)

---
 ...sk_rcnn_r50_1x_coco_train_infer_python.txt |  8 +-
 ...cnn_r50_fpn_1x_coco_train_infer_python.txt |  8 +-
 ...darknet53_270e_coco_train_infer_python.txt |  8 +-
 test_tipc/static/README.MD                    | 19 ++++
 ...sk_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh | 17 ++++
 ...ask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh | 13 +++
 .../benchmark_common/prepare.sh               | 15 ++++
 .../benchmark_common/run_benchmark.sh         | 88 +++++++++++++++++++
 ...cnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh | 17 ++++
 ...rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh | 13 +++
 .../benchmark_common/prepare.sh               | 15 ++++
 .../benchmark_common/run_benchmark.sh         | 88 +++++++++++++++++++
 ...darknet53_270e_coco_bs8_fp32_SingleP_DP.sh | 17 ++++
 ..._darknet53_270e_coco_bs8_fp32_MultiP_DP.sh | 13 +++
 .../benchmark_common/prepare.sh               | 15 ++++
 .../benchmark_common/run_benchmark.sh         | 88 +++++++++++++++++++
 16 files changed, 439 insertions(+), 3 deletions(-)
 create mode 100644 test_tipc/static/README.MD
 create mode 100644 test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh
 create mode 100644 test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh
 create mode 100644 test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh
 create mode 100644 test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh
 create mode 100644 test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh
 create mode 100644 test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh
 create mode 100644 test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh
 create mode 100644 test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh
 create mode 100644 test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
 create mode 100644 test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
 create mode 100644 test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh
 create mode 100644 test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh

diff --git a/test_tipc/configs/mask_rcnn/mask_rcnn_r50_1x_coco_train_infer_python.txt b/test_tipc/configs/mask_rcnn/mask_rcnn_r50_1x_coco_train_infer_python.txt
index 933c496e0..89ad118fc 100644
--- a/test_tipc/configs/mask_rcnn/mask_rcnn_r50_1x_coco_train_infer_python.txt
+++ b/test_tipc/configs/mask_rcnn/mask_rcnn_r50_1x_coco_train_infer_python.txt
@@ -48,4 +48,10 @@ inference:./deploy/python/infer.py
 --image_dir:./dataset/coco/test2017/
 --save_log_path:null
 --run_benchmark:True
---trt_max_shape:1600
\ No newline at end of file
+--trt_max_shape:1600
+===========================train_benchmark_params==========================
+batch_size:2|4
+fp_items:fp32|fp16
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:null
\ No newline at end of file
diff --git a/test_tipc/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco_train_infer_python.txt b/test_tipc/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco_train_infer_python.txt
index 078c5a2e5..d0ce936a4 100644
--- a/test_tipc/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco_train_infer_python.txt
+++ b/test_tipc/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco_train_infer_python.txt
@@ -48,4 +48,10 @@ inference:./deploy/python/infer.py
 --image_dir:./dataset/coco/test2017/
 --save_log_path:null
 --run_benchmark:True
---trt_max_shape:1600
\ No newline at end of file
+--trt_max_shape:1600
+===========================train_benchmark_params==========================
+batch_size:2|4
+fp_items:fp32|fp16
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:null
\ No newline at end of file
diff --git a/test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt b/test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt
index 349ab0b6c..7c5bff379 100644
--- a/test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt
+++ b/test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt
@@ -48,4 +48,10 @@ inference:./deploy/python/infer.py
 --image_dir:./dataset/coco/test2017/
 --save_log_path:null
 --run_benchmark:True
-null:null
\ No newline at end of file
+null:null
+===========================train_benchmark_params==========================
+batch_size:8
+fp_items:fp32|fp16
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:null
\ No newline at end of file
diff --git a/test_tipc/static/README.MD b/test_tipc/static/README.MD
new file mode 100644
index 000000000..94fb8f4e8
--- /dev/null
+++ b/test_tipc/static/README.MD
@@ -0,0 +1,19 @@
+# Running the static-graph benchmark models in PaddleDetection
+Notes on the static-graph benchmark test scripts
+# Directory layout
+# Docker runtime environment
+docker image: registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8-gcc82
+paddle = 2.2.2
+python = 3.7
+# Steps to run the benchmark tests
+git clone https://github.com/PaddlePaddle/PaddleDetection.git
+cd PaddleDetection
+# Prepare the data
+bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
+# Run the model
+## Single card (profiling is triggered automatically)
+export CUDA_VISIBLE_DEVICES=0
+bash test_tipc/static/${model_item}/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
+## Multiple cards
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+bash test_tipc/static/${model_item}/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh
new file mode 100644
index 000000000..698b4f7c3
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh
@@ -0,0 +1,17 @@
+model_item=mask_rcnn_r50_1x_coco
+bs_item=2
+fp_item=fp32
+run_process_type=SingleP
+run_mode=DP
+device_num=N1C1
+max_iter=500
+num_workers=2
+
+# get data
+bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
+# run
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+# run profiling
+sleep 10;
+export PROFILING=true
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh
new file mode 100644
index 000000000..ee3a9ddd0
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh
@@ -0,0 +1,13 @@
+model_item=mask_rcnn_r50_1x_coco
+bs_item=2
+fp_item=fp32
+run_process_type=MultiP
+run_mode=DP
+device_num=N1C8
+max_iter=500
+num_workers=2
+
+# get data
+bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
+# run
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh
new file mode 100644
index 000000000..e5a7024d4
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Run this from the root directory of the model repo
+################################# Install the framework, e.g.:
+echo "*******prepare benchmark start ***********"
+pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple
+echo `pip --version`
+pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple
+python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+################################# Prepare the training data, e.g.:
+wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar
+cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* .
+rm -rf coco_benchmark/ && cd ../../../
+echo "*******prepare benchmark end***********"
diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000..d8d5b44cb
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# Test training benchmark for a model.
+# Usage: bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+function _set_params(){
+    model_item=${1:-"model_item"}            # (required) model item
+    base_batch_size=${2:-"2"}                # (required) per-card batch size for static-graph single-process runs; multiply by the card count at training time
+    fp_item=${3:-"fp32"}                     # (required) fp32|fp16
+    run_process_type=${4:-"SingleP"}         # (required) SingleP (single process) | MultiP (multiple processes)
+    run_mode=${5:-"DP"}                      # (required) MP (model parallel) | DP (data parallel) | PP (pipeline parallel) | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
+    device_num=${6:-"N1C1"}                  # (required) number of cards used: N1C1|N1C8|N4C32 (4 nodes, 32 cards)
+    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleDetection"             # (required) name of the model suite
+    speed_unit="samples/sec"                 # (required) unit of the speed metric
+    skip_steps=10                            # (required) log parsing: skip the first few steps, whose performance is unstable
+    keyword="ips:"                           # (required) log parsing: keyword of the lines that carry performance data
+    convergence_key="loss:"                  # (optional) log parsing: keyword of the lines that carry convergence data, e.g. convergence_key="loss:"
+    max_iter=${7:-"500"}                     # (optional) keep the total run time under 5 minutes; if the code must be changed to stop early, submit a PR to the suite, or use a max_epoch parameter
+    num_workers=${8:-"8"}                    # (optional)
+# The commands below are generic; no changes are needed unless the model has special requirements
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}  # (required) do not change the format; it is aligned with the competitor model names
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}              # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}    # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+}
+function _train(){
+    cd ./static
+    batch_size=${base_batch_size}  # for multi-card single-process runs, compute the multi-card batch size here in _train
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} = "true" ];then
+        log_file=${profiling_log_file}
+        is_profiler=1
+    else
+        log_file=${train_log_file}
+        is_profiler=0
+    fi
+    if [ ${fp_item} = "fp16" ]; then
+        use_fp16_cmd="--fp16"
+    else
+        use_fp16_cmd=""
+    fi
+
+    train_cmd="-c configs/mask_rcnn_r50_1x.yml -o LearningRate.base_lr=0.001 snapshot_iter=100000 \
+               TrainReader.batch_size=${batch_size} \
+               max_iters=${max_iter} log_iter=1 \
+               TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \
+               --is_profiler=${is_profiler} "
+# The commands below are generic; no changes are needed unless the model has special requirements
+    case ${run_mode} in
+    DP) if [[ ${run_process_type} = "SingleP" ]];then
+            echo "run ${run_mode} ${run_process_type}"
+            train_cmd="python -u tools/train.py ${train_cmd}"
+        elif [[ ${run_process_type} = "MultiP" ]];then
+            rm -rf ./mylog
+            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
+                       tools/train.py ${train_cmd}"
+        else
+            echo "run ${run_mode} ${run_process_type} error"; exit 1
+        fi
+        ;;
+    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
+    *) echo "choose run_mode "; exit 1;
+    esac
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    if [ $? -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+        rm ${log_file}
+        cp mylog/workerlog.0 ${log_file}
+    fi
+    cd ../
+}
+source ${BENCHMARK_ROOT}/scripts/run_model.sh   # parses performance data from benchmark-style logs with analysis.py; comment this out if you only need the raw training log, but re-enable it before submitting
+_set_params $@
+#_train        # uncomment to produce only the training log, without parsing
+_run           # defined in run_model.sh; it calls _train internally; comment this out if you only need the raw training log, but re-enable it before submitting
diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh
new file mode 100644
index 000000000..499468244
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh
@@ -0,0 +1,17 @@
+model_item=mask_rcnn_r50_fpn_1x_coco
+bs_item=2
+fp_item=fp32
+run_process_type=SingleP
+run_mode=DP
+device_num=N1C1
+max_iter=500
+num_workers=2
+
+# get data
+bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
+# run
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+# run profiling
+sleep 10;
+export PROFILING=true
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh
new file mode 100644
index 000000000..7781efff8
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh
@@ -0,0 +1,13 @@
+model_item=mask_rcnn_r50_fpn_1x_coco
+bs_item=2
+fp_item=fp32
+run_process_type=MultiP
+run_mode=DP
+device_num=N1C8
+max_iter=500
+num_workers=2
+
+# get data
+bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
+# run
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh
new file mode 100644
index 000000000..e5a7024d4
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Run this from the root directory of the model repo
+################################# Install the framework, e.g.:
+echo "*******prepare benchmark start ***********"
+pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple
+echo `pip --version`
+pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple
+python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+################################# Prepare the training data, e.g.:
+wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar
+cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* .
+rm -rf coco_benchmark/ && cd ../../../
+echo "*******prepare benchmark end***********"
diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000..520af4d4c
--- /dev/null
+++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# Test training benchmark for a model.
+# Usage: bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+function _set_params(){
+    model_item=${1:-"model_item"}            # (required) model item
+    base_batch_size=${2:-"2"}                # (required) per-card batch size for static-graph single-process runs; multiply by the card count at training time
+    fp_item=${3:-"fp32"}                     # (required) fp32|fp16
+    run_process_type=${4:-"SingleP"}         # (required) SingleP (single process) | MultiP (multiple processes)
+    run_mode=${5:-"DP"}                      # (required) MP (model parallel) | DP (data parallel) | PP (pipeline parallel) | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
+    device_num=${6:-"N1C1"}                  # (required) number of cards used: N1C1|N1C8|N4C32 (4 nodes, 32 cards)
+    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleDetection"             # (required) name of the model suite
+    speed_unit="samples/sec"                 # (required) unit of the speed metric
+    skip_steps=10                            # (required) log parsing: skip the first few steps, whose performance is unstable
+    keyword="ips:"                           # (required) log parsing: keyword of the lines that carry performance data
+    convergence_key="loss:"                  # (optional) log parsing: keyword of the lines that carry convergence data, e.g. convergence_key="loss:"
+    max_iter=${7:-"500"}                     # (optional) keep the total run time under 5 minutes; if the code must be changed to stop early, submit a PR to the suite, or use a max_epoch parameter
+    num_workers=${8:-"8"}                    # (optional)
+# The commands below are generic; no changes are needed unless the model has special requirements
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}  # (required) do not change the format; it is aligned with the competitor model names
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}              # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}    # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+}
+function _train(){
+    cd ./static
+    batch_size=${base_batch_size}  # for multi-card single-process runs, compute the multi-card batch size here in _train
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} = "true" ];then
+        log_file=${profiling_log_file}
+        is_profiler=1
+    else
+        log_file=${train_log_file}
+        is_profiler=0
+    fi
+    if [ ${fp_item} = "fp16" ]; then
+        use_fp16_cmd="--fp16"
+    else
+        use_fp16_cmd=""
+    fi
+
+    train_cmd="-c configs/mask_rcnn_r50_fpn_1x.yml -o LearningRate.base_lr=0.001 snapshot_iter=100000 \
+               TrainReader.batch_size=${batch_size} \
+               max_iters=${max_iter} log_iter=1 \
+               TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \
+               --is_profiler=${is_profiler} "
+# The commands below are generic; no changes are needed unless the model has special requirements
+    case ${run_mode} in
+    DP) if [[ ${run_process_type} = "SingleP" ]];then
+            echo "run ${run_mode} ${run_process_type}"
+            train_cmd="python -u tools/train.py ${train_cmd}"
+        elif [[ ${run_process_type} = "MultiP" ]];then
+            rm -rf ./mylog
+            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
+                       tools/train.py ${train_cmd}"
+        else
+            echo "run ${run_mode} ${run_process_type} error"; exit 1
+        fi
+        ;;
+    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
+    *) echo "choose run_mode "; exit 1;
+    esac
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    if [ $? -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+        rm ${log_file}
+        cp mylog/workerlog.0 ${log_file}
+    fi
+    cd ../
+}
+source ${BENCHMARK_ROOT}/scripts/run_model.sh   # parses performance data from benchmark-style logs with analysis.py; comment this out if you only need the raw training log, but re-enable it before submitting
+_set_params $@
+#_train        # uncomment to produce only the training log, without parsing
+_run           # defined in run_model.sh; it calls _train internally; comment this out if you only need the raw training log, but re-enable it before submitting
diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh b/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
new file mode 100644
index 000000000..db222b973
--- /dev/null
+++ b/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
@@ -0,0 +1,17 @@
+model_item=yolov3_darknet53_270e_coco
+bs_item=8
+fp_item=fp32
+run_process_type=SingleP
+run_mode=DP
+device_num=N1C1
+max_iter=500
+num_workers=8
+
+# get data
+bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
+# run
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+# run profiling
+sleep 10;
+export PROFILING=true
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh b/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
new file mode 100644
index 000000000..80c80fe97
--- /dev/null
+++ b/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
@@ -0,0 +1,13 @@
+model_item=yolov3_darknet53_270e_coco
+bs_item=8
+fp_item=fp32
+run_process_type=MultiP
+run_mode=DP
+device_num=N1C8
+max_iter=500
+num_workers=8
+
+# get data
+bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
+# run
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh
new file mode 100644
index 000000000..e5a7024d4
--- /dev/null
+++ b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Run this from the root directory of the model repo
+################################# Install the framework, e.g.:
+echo "*******prepare benchmark start ***********"
+pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple
+echo `pip --version`
+pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple
+python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+################################# Prepare the training data, e.g.:
+wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar
+cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* .
+rm -rf coco_benchmark/ && cd ../../../
+echo "*******prepare benchmark end***********"
diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000..fcd55431b
--- /dev/null
+++ b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# Test training benchmark for a model.
+# Usage: bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+function _set_params(){
+    model_item=${1:-"model_item"}            # (required) model item
+    base_batch_size=${2:-"2"}                # (required) per-card batch size for static-graph single-process runs; multiply by the card count at training time
+    fp_item=${3:-"fp32"}                     # (required) fp32|fp16
+    run_process_type=${4:-"SingleP"}         # (required) SingleP (single process) | MultiP (multiple processes)
+    run_mode=${5:-"DP"}                      # (required) MP (model parallel) | DP (data parallel) | PP (pipeline parallel) | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
+    device_num=${6:-"N1C1"}                  # (required) number of cards used: N1C1|N1C8|N4C32 (4 nodes, 32 cards)
+    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleDetection"             # (required) name of the model suite
+    speed_unit="samples/sec"                 # (required) unit of the speed metric
+    skip_steps=10                            # (required) log parsing: skip the first few steps, whose performance is unstable
+    keyword="ips:"                           # (required) log parsing: keyword of the lines that carry performance data
+    convergence_key="loss:"                  # (optional) log parsing: keyword of the lines that carry convergence data, e.g. convergence_key="loss:"
+    max_iter=${7:-"500"}                     # (optional) keep the total run time under 5 minutes; if the code must be changed to stop early, submit a PR to the suite, or use a max_epoch parameter
+    num_workers=${8:-"8"}                    # (optional)
+# The commands below are generic; no changes are needed unless the model has special requirements
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}  # (required) do not change the format; it is aligned with the competitor model names
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}              # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}    # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+}
+function _train(){
+    cd ./static
+    batch_size=${base_batch_size}  # for multi-card single-process runs, compute the multi-card batch size here in _train
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} = "true" ];then
+        log_file=${profiling_log_file}
+        is_profiler=1
+    else
+        log_file=${train_log_file}
+        is_profiler=0
+    fi
+    if [ ${fp_item} = "fp16" ]; then
+        use_fp16_cmd="--fp16"
+    else
+        use_fp16_cmd=""
+    fi
+
+    train_cmd="-c configs/yolov3_darknet.yml -o LearningRate.base_lr=0.002 snapshot_iter=100000 \
+               TrainReader.batch_size=${batch_size} \
+               max_iters=${max_iter} log_iter=1 \
+               TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \
+               --is_profiler=${is_profiler} "
+# The commands below are generic; no changes are needed unless the model has special requirements
+    case ${run_mode} in
+    DP) if [[ ${run_process_type} = "SingleP" ]];then
+            echo "run ${run_mode} ${run_process_type}"
+            train_cmd="python -u tools/train.py ${train_cmd}"
+        elif [[ ${run_process_type} = "MultiP" ]];then
+            rm -rf ./mylog
+            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
+                       tools/train.py ${train_cmd}"
+        else
+            echo "run ${run_mode} ${run_process_type} error"; exit 1
+        fi
+        ;;
+    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
+    *) echo "choose run_mode "; exit 1;
+    esac
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    if [ $? -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+        rm ${log_file}
+        cp mylog/workerlog.0 ${log_file}
+    fi
+    cd ../
+}
+source ${BENCHMARK_ROOT}/scripts/run_model.sh   # parses performance data from benchmark-style logs with analysis.py; comment this out if you only need the raw training log, but re-enable it before submitting
+_set_params $@
+#_train        # uncomment to produce only the training log, without parsing
+_run           # defined in run_model.sh; it calls _train internally; comment this out if you only need the raw training log, but re-enable it before submitting
-- 
GitLab
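
For reference, the end-to-end invocation sequence described in the README above, applied to the mask_rcnn_r50_1x_coco scripts added by this patch, would look roughly as follows. This is a sketch rather than part of the patch: it assumes the patch has been applied at the PaddleDetection repo root and that BENCHMARK_ROOT points at a checkout of the benchmark framework (the /path/to/benchmark location is illustrative), since run_benchmark.sh sources ${BENCHMARK_ROOT}/scripts/run_model.sh.

# clone the detection repo and enter it
git clone https://github.com/PaddlePaddle/PaddleDetection.git
cd PaddleDetection

# run_benchmark.sh sources ${BENCHMARK_ROOT}/scripts/run_model.sh, so this must point
# at a checkout of the benchmark framework (illustrative path, not part of the patch)
export BENCHMARK_ROOT=/path/to/benchmark

# single-card run; the N1C1 entry script downloads the data via prepare.sh and then
# triggers a second, profiling run by exporting PROFILING=true
export CUDA_VISIBLE_DEVICES=0
bash test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh

# eight-card run on one node
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
bash test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh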