PaddleDetection · Commit 7b60e7d8 (unverified)
Authored by shangliang Xu, Feb 18, 2022
Committed via GitHub, Feb 18, 2022
[TIPC] add benchmark for yolov3, mask_rcnn (#5224)
Parent: 92872581
Showing 16 changed files with 439 additions and 3 deletions (+439, -3)
test_tipc/configs/mask_rcnn/mask_rcnn_r50_1x_coco_train_infer_python.txt  +7 -1
test_tipc/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco_train_infer_python.txt  +7 -1
test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt  +7 -1
test_tipc/static/README.MD  +19 -0
test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh  +17 -0
test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh  +13 -0
test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh  +15 -0
test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh  +88 -0
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh  +17 -0
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh  +13 -0
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh  +15 -0
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh  +88 -0
test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh  +17 -0
test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh  +13 -0
test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh  +15 -0
test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh  +88 -0
test_tipc/configs/mask_rcnn/mask_rcnn_r50_1x_coco_train_infer_python.txt
@@ -49,3 +49,9 @@ inference:./deploy/python/infer.py
--save_log_path:null
--run_benchmark:True
--trt_max_shape:1600
===========================train_benchmark_params==========================
batch_size:2|4
fp_items:fp32|fp16
epoch:1
--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
flags:null
\ No newline at end of file
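The train_benchmark_params block added above is a plain key:value list that the TIPC tooling reads to decide which batch sizes and precisions to sweep. The real reader lives in the test_tipc scripts; the snippet below is only a minimal sketch of how such a block could be consumed, assuming simple grep/cut parsing, and the get_param helper is hypothetical:

#!/usr/bin/env bash
# Hypothetical sketch, not the real TIPC parser: read key:value fields from a
# params file and enumerate the batch-size / precision sweep.
params_file=test_tipc/configs/mask_rcnn/mask_rcnn_r50_1x_coco_train_infer_python.txt

get_param() {
    # Print the value after the first ':' on the line whose key matches $1.
    grep "^$1:" "${params_file}" | head -n1 | cut -d: -f2-
}

batch_sizes=$(get_param batch_size)   # e.g. "2|4"
fp_items=$(get_param fp_items)        # e.g. "fp32|fp16"

IFS='|' read -ra bs_list <<< "${batch_sizes}"
IFS='|' read -ra fp_list <<< "${fp_items}"
for bs in "${bs_list[@]}"; do
    for fp in "${fp_list[@]}"; do
        echo "would launch a benchmark run with batch_size=${bs}, precision=${fp}"
    done
done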
test_tipc/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco_train_infer_python.txt
@@ -49,3 +49,9 @@ inference:./deploy/python/infer.py
--save_log_path:null
--run_benchmark:True
--trt_max_shape:1600
===========================train_benchmark_params==========================
batch_size:2|4
fp_items:fp32|fp16
epoch:1
--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
flags:null
\ No newline at end of file
test_tipc/configs/yolov3/yolov3_darknet53_270e_coco_train_infer_python.txt
@@ -49,3 +49,9 @@ inference:./deploy/python/infer.py
--save_log_path:null
--run_benchmark:True
null:null
===========================train_benchmark_params==========================
batch_size:8
fp_items:fp32|fp16
epoch:1
--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
flags:null
\ No newline at end of file
test_tipc/static/README.MD
new file mode 100644
# How to run the static-graph benchmark models in PaddleDetection
Notes on the static-graph benchmark test scripts.
# Directory layout
# Docker environment
docker image: registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8-gcc82
paddle = 2.2.2
python = 3.7
# Steps to run the benchmark tests
git clone https://github.com/PaddlePaddle/PaddleDetection.git
cd PaddleDetection
# Prepare the data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# Run a model
## Single card (profiling is run automatically)
export CUDA_VISIBLE_DEVICES=0
bash test_tipc/static/${model_item}/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
## Multi-card
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
bash test_tipc/static/${model_item}/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
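As a concrete example of the steps above, a single-card run for the Mask R-CNN model added in this commit could look like the following sketch (working directory assumed to be the PaddleDetection repository root):

#!/usr/bin/env bash
# Example: single-card benchmark for one of the models added in this commit.
model_item=mask_rcnn_r50_1x_coco

# Install dependencies and download the coco_benchmark data.
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh

# Single-card run; the N1C1 script also launches a profiling pass afterwards.
export CUDA_VISIBLE_DEVICES=0
bash test_tipc/static/${model_item}/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh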
test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh
new file mode 100644
model_item=mask_rcnn_r50_1x_coco
bs_item=2
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_iter=500
num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh
new file mode 100644
model_item=mask_rcnn_r50_1x_coco
bs_item=2
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_iter=500
num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh
new file mode 100644
#!/usr/bin/env bash
# Run from the root directory of the model repository.
################################# Install the framework, e.g.:
echo "*******prepare benchmark start ***********"
pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple
echo `pip --version`
pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple
python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
################################# Prepare the training data, e.g.:
wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar
cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* .
rm -rf coco_benchmark/ && cd ../../../
echo "*******prepare benchmark end***********"
test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh
new file mode 100644
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage: bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
function _set_params(){
    model_item=${1:-"model_item"}            # (required) model item
    base_batch_size=${2:-"2"}                # (required) for static-graph single-process runs this is the per-card batch size; multiply it by the card count at training time
    fp_item=${3:-"fp32"}                     # (required) fp32|fp16
    run_process_type=${4:-"SingleP"}         # (required) single process SingleP | multi process MultiP
    run_mode=${5:-"DP"}                      # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid DP1-MP1-PP1 | DP1-MP4-PP1
    device_num=${6:-"N1C1"}                  # (required) number of cards used, N1C1|N1C8|N4C32 (4 nodes, 32 cards)
    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in via a global variable
    model_repo="PaddleDetection"             # (required) name of the model suite
    speed_unit="samples/sec"                 # (required) unit of the speed metric
    skip_steps=10                            # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                           # (required) keyword that marks the performance lines in the log
    convergence_key="loss:"                  # (optional) keyword that marks the convergence lines in the log, e.g. convergence_key="loss:"
    max_iter=${7:-"500"}                     # (optional) keep the run within 5 minutes; if the code must be changed to stop early, submit a PR to the suite, or use a max_epoch parameter
    num_workers=${8:-"8"}                    # (optional)

    # The commands below are generic; no changes are needed unless the model is special.
    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}  # (required) do not change this format; it is aligned with competitor model names

    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}            # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}

    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}

function _train(){
    cd ./static
    batch_size=${base_batch_size}  # For multi-card single-process runs, compute the multi-card batch size here in _train.

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        log_file=${profiling_log_file}
        is_profiler=1
    else
        log_file=${train_log_file}
        is_profiler=0
    fi

    if [ ${fp_item} = "fp16" ]; then
        use_fp16_cmd="--fp16"
    else
        use_fp16_cmd=""
    fi

    train_cmd="-c configs/mask_rcnn_r50_1x.yml -o LearningRate.base_lr=0.001 snapshot_iter=100000 \
               TrainReader.batch_size=${batch_size} \
               max_iters=${max_iter} log_iter=1 \
               TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \
               --is_profiler=${is_profiler}"

    # The commands below are generic; no changes are needed unless the model is special.
    case ${run_mode} in
    DP)
        if [[ ${run_process_type} = "SingleP" ]]; then
            echo "run ${run_mode} ${run_process_type}"
            train_cmd="python -u tools/train.py ${train_cmd}"
        elif [[ ${run_process_type} = "MultiP" ]]; then
            rm -rf ./mylog
            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
                       tools/train.py ${train_cmd}"
        else
            echo "run ${run_mode} ${run_process_type} error"; exit 1
        fi
        ;;
    DP1-MP1-PP1)
        echo "run run_mode: DP1-MP1-PP1" ;;
    *)
        echo "choose run_mode "; exit 1;
    esac

    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    timeout 15m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`

    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.0 ${log_file}
    fi
    cd ../
}

source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh runs analysis.py on benchmark-compliant logs to extract performance data; comment this line out if you only want to produce training logs without the framework, and re-enable it before submitting.
_set_params $@
#_train  # Uncomment this line if you only want to produce the training log without parsing.
_run     # _run is defined in run_model.sh and calls _train; comment it out if you only want to produce training logs without the framework, and re-enable it before submitting.
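The skip_steps=10 and keyword="ips:" settings above tell the benchmark framework how to extract throughput from the training log; the actual parsing is done by analysis.py in the external benchmark framework, which is not part of this commit. A rough stand-in, assuming log lines of the form "... ips: 12.34 samples/sec", could look like this hypothetical sketch:

#!/usr/bin/env bash
# Hypothetical sketch of the throughput averaging done by the benchmark
# framework's analysis step; the log format here is an assumption.
log_file=${1:-train.log}   # a training log produced by run_benchmark.sh
skip_steps=10              # ignore the first unstable steps, matching skip_steps above

grep "ips:" "${log_file}" | awk -v skip="${skip_steps}" '
    {
        # Pick the numeric field that follows the "ips:" token on each matching line.
        for (i = 1; i <= NF; i++) if ($i == "ips:") val = $(i + 1)
        n++
        if (n > skip) { sum += val; cnt++ }
    }
    END {
        if (cnt > 0) printf "average ips over %d steps: %.4f samples/sec\n", cnt, sum / cnt
        else print "no ips lines found after the skipped warm-up steps"
    }'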
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh
new file mode 100644
model_item=mask_rcnn_r50_fpn_1x_coco
bs_item=2
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_iter=500
num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh
new file mode 100644
model_item=mask_rcnn_r50_fpn_1x_coco
bs_item=2
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_iter=500
num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh
new file mode 100644
#!/usr/bin/env bash
# Run from the root directory of the model repository.
################################# Install the framework, e.g.:
echo "*******prepare benchmark start ***********"
pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple
echo `pip --version`
pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple
python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
################################# Prepare the training data, e.g.:
wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar
cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* .
rm -rf coco_benchmark/ && cd ../../../
echo "*******prepare benchmark end***********"
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh
new file mode 100644
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage: bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
function _set_params(){
    model_item=${1:-"model_item"}            # (required) model item
    base_batch_size=${2:-"2"}                # (required) for static-graph single-process runs this is the per-card batch size; multiply it by the card count at training time
    fp_item=${3:-"fp32"}                     # (required) fp32|fp16
    run_process_type=${4:-"SingleP"}         # (required) single process SingleP | multi process MultiP
    run_mode=${5:-"DP"}                      # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid DP1-MP1-PP1 | DP1-MP4-PP1
    device_num=${6:-"N1C1"}                  # (required) number of cards used, N1C1|N1C8|N4C32 (4 nodes, 32 cards)
    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in via a global variable
    model_repo="PaddleDetection"             # (required) name of the model suite
    speed_unit="samples/sec"                 # (required) unit of the speed metric
    skip_steps=10                            # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                           # (required) keyword that marks the performance lines in the log
    convergence_key="loss:"                  # (optional) keyword that marks the convergence lines in the log, e.g. convergence_key="loss:"
    max_iter=${7:-"500"}                     # (optional) keep the run within 5 minutes; if the code must be changed to stop early, submit a PR to the suite, or use a max_epoch parameter
    num_workers=${8:-"8"}                    # (optional)

    # The commands below are generic; no changes are needed unless the model is special.
    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}  # (required) do not change this format; it is aligned with competitor model names

    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}            # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}

    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}

function _train(){
    cd ./static
    batch_size=${base_batch_size}  # For multi-card single-process runs, compute the multi-card batch size here in _train.

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        log_file=${profiling_log_file}
        is_profiler=1
    else
        log_file=${train_log_file}
        is_profiler=0
    fi

    if [ ${fp_item} = "fp16" ]; then
        use_fp16_cmd="--fp16"
    else
        use_fp16_cmd=""
    fi

    train_cmd="-c configs/mask_rcnn_r50_fpn_1x.yml -o LearningRate.base_lr=0.001 snapshot_iter=100000 \
               TrainReader.batch_size=${batch_size} \
               max_iters=${max_iter} log_iter=1 \
               TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \
               --is_profiler=${is_profiler}"

    # The commands below are generic; no changes are needed unless the model is special.
    case ${run_mode} in
    DP)
        if [[ ${run_process_type} = "SingleP" ]]; then
            echo "run ${run_mode} ${run_process_type}"
            train_cmd="python -u tools/train.py ${train_cmd}"
        elif [[ ${run_process_type} = "MultiP" ]]; then
            rm -rf ./mylog
            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
                       tools/train.py ${train_cmd}"
        else
            echo "run ${run_mode} ${run_process_type} error"; exit 1
        fi
        ;;
    DP1-MP1-PP1)
        echo "run run_mode: DP1-MP1-PP1" ;;
    *)
        echo "choose run_mode "; exit 1;
    esac

    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    timeout 15m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`

    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.0 ${log_file}
    fi
    cd ../
}

source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh runs analysis.py on benchmark-compliant logs to extract performance data; comment this line out if you only want to produce training logs without the framework, and re-enable it before submitting.
_set_params $@
#_train  # Uncomment this line if you only want to produce the training log without parsing.
_run     # _run is defined in run_model.sh and calls _train; comment it out if you only want to produce training logs without the framework, and re-enable it before submitting.
test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
new file mode 100644
model_item=yolov3_darknet53_270e_coco
bs_item=8
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_iter=500
num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
new file mode 100644
model_item=yolov3_darknet53_270e_coco
bs_item=8
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_iter=500
num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh
new file mode 100644
#!/usr/bin/env bash
# Run from the root directory of the model repository.
################################# Install the framework, e.g.:
echo "*******prepare benchmark start ***********"
pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple
echo `pip --version`
pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple
python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
################################# Prepare the training data, e.g.:
wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar
cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* .
rm -rf coco_benchmark/ && cd ../../../
echo "*******prepare benchmark end***********"
test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh
new file mode 100644
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage: bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
function _set_params(){
    model_item=${1:-"model_item"}            # (required) model item
    base_batch_size=${2:-"2"}                # (required) for static-graph single-process runs this is the per-card batch size; multiply it by the card count at training time
    fp_item=${3:-"fp32"}                     # (required) fp32|fp16
    run_process_type=${4:-"SingleP"}         # (required) single process SingleP | multi process MultiP
    run_mode=${5:-"DP"}                      # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid DP1-MP1-PP1 | DP1-MP4-PP1
    device_num=${6:-"N1C1"}                  # (required) number of cards used, N1C1|N1C8|N4C32 (4 nodes, 32 cards)
    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in via a global variable
    model_repo="PaddleDetection"             # (required) name of the model suite
    speed_unit="samples/sec"                 # (required) unit of the speed metric
    skip_steps=10                            # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                           # (required) keyword that marks the performance lines in the log
    convergence_key="loss:"                  # (optional) keyword that marks the convergence lines in the log, e.g. convergence_key="loss:"
    max_iter=${7:-"500"}                     # (optional) keep the run within 5 minutes; if the code must be changed to stop early, submit a PR to the suite, or use a max_epoch parameter
    num_workers=${8:-"8"}                    # (optional)

    # The commands below are generic; no changes are needed unless the model is special.
    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}  # (required) do not change this format; it is aligned with competitor model names

    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}            # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}

    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}

function _train(){
    cd ./static
    batch_size=${base_batch_size}  # For multi-card single-process runs, compute the multi-card batch size here in _train.

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        log_file=${profiling_log_file}
        is_profiler=1
    else
        log_file=${train_log_file}
        is_profiler=0
    fi

    if [ ${fp_item} = "fp16" ]; then
        use_fp16_cmd="--fp16"
    else
        use_fp16_cmd=""
    fi

    train_cmd="-c configs/yolov3_darknet.yml -o LearningRate.base_lr=0.002 snapshot_iter=100000 \
               TrainReader.batch_size=${batch_size} \
               max_iters=${max_iter} log_iter=1 \
               TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \
               --is_profiler=${is_profiler}"

    # The commands below are generic; no changes are needed unless the model is special.
    case ${run_mode} in
    DP)
        if [[ ${run_process_type} = "SingleP" ]]; then
            echo "run ${run_mode} ${run_process_type}"
            train_cmd="python -u tools/train.py ${train_cmd}"
        elif [[ ${run_process_type} = "MultiP" ]]; then
            rm -rf ./mylog
            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
                       tools/train.py ${train_cmd}"
        else
            echo "run ${run_mode} ${run_process_type} error"; exit 1
        fi
        ;;
    DP1-MP1-PP1)
        echo "run run_mode: DP1-MP1-PP1" ;;
    *)
        echo "choose run_mode "; exit 1;
    esac

    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    timeout 15m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`

    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.0 ${log_file}
    fi
    cd ../
}

source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh runs analysis.py on benchmark-compliant logs to extract performance data; comment this line out if you only want to produce training logs without the framework, and re-enable it before submitting.
_set_params $@
#_train  # Uncomment this line if you only want to produce the training log without parsing.
_run     # _run is defined in run_model.sh and calls _train; comment it out if you only want to produce training logs without the framework, and re-enable it before submitting.