PaddlePaddle / PaddleDetection
Commit 7cb9fc59 (unverified)
Authored by gmm on Mar 09, 2022; committed via GitHub on Mar 09, 2022
fix benchmark (#5335)

* fix benchmark, delete run_process_type
* fix
* fix benchmark
Parent: 9f9df6f7

Showing 10 changed files with 61 additions and 69 deletions (+61 / -69):
    test_tipc/benchmark_train.sh                                                                  +7  -12
    test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh              +2  -3
    test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh              +1  -2
    test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh                      +15 -14
    test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh      +2  -3
    test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh      +1  -2
    test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh                  +15 -14
    test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh    +2  -3
    test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh    +1  -2
    test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh                 +15 -14
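The thread running through all ten files is that the run_process_type argument (SingleP/MultiP) is dropped from the TIPC benchmark chain; whether a run is single- or multi-process is now derived from device_num. A minimal before/after sketch of the launcher call, taken from the N1C1 config diffs below (the comments are editorial, not part of the diff):

    # before: 8 positional arguments, the 4th being SingleP or MultiP
    bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
    # after: 7 positional arguments; run_mode, device_num, max_iter, num_workers shift to $4-$7,
    # and run_benchmark.sh decides single vs. multi process from device_num (N1C1 means one card, one process)
    bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;

The per-file diffs follow.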
test_tipc/benchmark_train.sh

@@ -137,7 +137,6 @@ else
     batch_size=${params_list[1]}
     batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
     precision=${params_list[2]}
-    # run_process_type=${params_list[3]}
     run_mode=${params_list[3]}
     device_num=${params_list[4]}
     IFS=";"
@@ -162,10 +161,9 @@ for batch_size in ${batch_size_list[*]}; do
     gpu_id=$(set_gpu_id $device_num)
     if [ ${#gpu_id} -le 1 ]; then
-        run_process_type="SingleP"
         log_path="$SAVE_LOG/profiling_log"
         mkdir -p $log_path
-        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling"
+        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling"
         func_sed_params "$FILENAME" "${line_gpuid}" "0"  # sed used gpu_id
         # set profile_option params
         tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
@@ -181,8 +179,8 @@ for batch_size in ${batch_size_list[*]}; do
         speed_log_path="$SAVE_LOG/index"
         mkdir -p $log_path
         mkdir -p $speed_log_path
-        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
-        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
+        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
         func_sed_params "$FILENAME" "${line_profile}" "null"  # sed profile_id as null
         cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
         echo $cmd
@@ -193,13 +191,12 @@ for batch_size in ${batch_size_list[*]}; do
         eval "cat ${log_path}/${log_name}"
         # parser log
-        _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+        _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
         cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                 --speed_log_file '${speed_log_path}/${speed_log_name}' \
                 --model_name ${_model_name} \
                 --base_batch_size ${batch_size} \
                 --run_mode ${run_mode} \
-                --run_process_type ${run_process_type} \
                 --fp_item ${precision} \
                 --keyword ips: \
                 --skip_steps 2 \
@@ -213,13 +210,12 @@ for batch_size in ${batch_size_list[*]}; do
     else
         IFS=";"
         unset_env=`unset CUDA_VISIBLE_DEVICES`
-        run_process_type="MultiP"
         log_path="$SAVE_LOG/train_log"
         speed_log_path="$SAVE_LOG/index"
         mkdir -p $log_path
         mkdir -p $speed_log_path
-        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
-        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
+        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
         func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id"  # sed used gpu_id
         func_sed_params "$FILENAME" "${line_profile}" "null"  # sed --profile_option as null
         cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
@@ -230,14 +226,13 @@ for batch_size in ${batch_size_list[*]}; do
         export model_run_time=$((${job_et}-${job_bt}))
         eval "cat ${log_path}/${log_name}"
         # parser log
-        _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+        _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
         cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                 --speed_log_file '${speed_log_path}/${speed_log_name}' \
                 --model_name ${_model_name} \
                 --base_batch_size ${batch_size} \
                 --run_mode ${run_mode} \
-                --run_process_type ${run_process_type} \
                 --fp_item ${precision} \
                 --keyword ips: \
                 --skip_steps 2 \
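For reference, after this change the analysis command assembled by benchmark_train.sh no longer carries --run_process_type. A hedged sketch of how it would expand for the mask_rcnn_r50_1x_coco N1C1 case (the concrete values are illustrative; the trailing arguments are truncated in the diff view above and left out here):

    ${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
            --speed_log_file '${speed_log_path}/${speed_log_name}' \
            --model_name mask_rcnn_r50_1x_coco_bs2_fp32_DP \
            --base_batch_size 2 \
            --run_mode DP \
            --fp_item fp32 \
            --keyword ips: \
            --skip_steps 2 \
            ...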
test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh → test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
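Assuming the renamed entry script above, a single-card benchmark for this model is now started from the PaddleDetection repository root with (a usage sketch, not part of the diff):

    bash test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh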
test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh → test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh

 #!/usr/bin/env bash
 # Test training benchmark for a model.
-# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
 function _set_params(){
     model_item=${1:-"model_item"}          # (必选) 模型 item
     base_batch_size=${2:-"2"}              # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
     fp_item=${3:-"fp32"}                   # (必选) fp32|fp16
-    run_process_type=${4:-"SingleP"}       # (必选) 单进程 SingleP|多进程 MultiP
-    run_mode=${5:-"DP"}                    # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
-    device_num=${6:-"N1C1"}                # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
+    run_mode=${4:-"DP"}                    # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
+    device_num=${5:-"N1C1"}                # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
     profiling=${PROFILING:-"false"}        # (必选) Profiling 开关,默认关闭,通过全局变量传递
     model_repo="PaddleDetection"           # (必选) 模型套件的名字
     speed_unit="samples/sec"               # (必选)速度指标单位
     skip_steps=10                          # (必选)解析日志,跳过模型前几个性能不稳定的step
     keyword="ips:"                         # (必选)解析日志,筛选出性能数据所在行的关键字
     convergence_key="loss:"                # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
-    max_iter=${7:-"100"}                   # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
-    num_workers=${8:-"8"}                  # (可选)
+    max_iter=${6:-"100"}                   # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
+    num_workers=${7:-"8"}                  # (可选)
     # 以下为通用执行命令,无特殊可不用修改
-    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}    # (必填) 且格式不要改动,与竞品名称对齐
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode}    # (必填) 且格式不要改动,与竞品名称对齐
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
     speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
 }

 function _train(){
+    export FLAGS_eager_delete_tensor_gb=0.0
+    export FLAGS_fraction_of_gpu_memory_to_use=0.98
+    export FLAGS_memory_fraction_of_eager_deletion=1.0
     cd ./static
     batch_size=${base_batch_size}    # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs
     echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
                --is_profiler=${is_profiler}"
     # 以下为通用执行命令,无特殊可不用修改
     case ${run_mode} in
-    DP) if [[ ${run_process_type} = "SingleP" ]];then
-        echo "run ${run_mode} ${run_process_type}"
+    DP) if [[ ${device_num} = "N1C1" ]];then
+        echo "run ${run_mode} ${device_num}"
         train_cmd="python -u tools/train.py ${train_cmd}"
-        elif [[ ${run_process_type} = "MultiP" ]];then
+        else
         rm -rf ./mylog
         train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
            tools/train.py ${train_cmd}"
-        else
-        echo "run ${run_mode} ${run_process_type} error", exit 1
         fi
         ;;
     DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
     *) echo "choose run_mode "; exit 1;
     esac
     echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
     timeout 15m ${train_cmd} > ${log_file} 2>&1
     if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
     else
         echo -e "${model_name}, SUCCESS"
     fi
-    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
-    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+    # kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
         rm ${log_file}
         cp mylog/workerlog.0 ${log_file}
     fi
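With the new _set_params signature, calling the common runner directly looks like the following sketch; the argument values mirror the N1C1/N1C8 configs above (model_item, bs_item, fp_item, run_mode, device_num, max_iter, num_workers, in that order):

    # one card, one process: device_num=N1C1 selects "python -u tools/train.py"
    bash test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh mask_rcnn_r50_1x_coco 2 fp32 DP N1C1 100 2
    # eight cards: any device_num other than N1C1 selects paddle.distributed.launch with --log_dir=./mylog
    bash test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh mask_rcnn_r50_1x_coco 2 fp32 DP N1C8 100 2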
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh → test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_fpn_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh → test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_fpn_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh

The changes are identical to those shown above for test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh: the Usage comment and _set_params drop run_process_type and renumber the positional arguments (run_mode=$4, device_num=$5, max_iter=$6, num_workers=$7), model_name loses the ${run_process_type} segment, _train gains the three FLAGS_* exports, the DP case branches on [[ ${device_num} = "N1C1" ]] instead of ${run_process_type}, and the trailing workerlog-copy block tests [ ${device_num} != "N1C1" -a -d mylog ] with the kill command commented out.
test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh → test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh

 model_item=yolov3_darknet53_270e_coco
 bs_item=8
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=8
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh → test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh

 model_item=yolov3_darknet53_270e_coco
 bs_item=8
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=8
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh

The changes are identical to those shown above for test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh: the Usage comment and _set_params drop run_process_type and renumber the positional arguments (run_mode=$4, device_num=$5, max_iter=$6, num_workers=$7), model_name loses the ${run_process_type} segment, _train gains the three FLAGS_* exports, the DP case branches on [[ ${device_num} = "N1C1" ]] instead of ${run_process_type}, and the trailing workerlog-copy block tests [ ${device_num} != "N1C1" -a -d mylog ] with the kill command commented out.