diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh
index befa75526995e36518dd241e8e5d49fb5b654d73..47415bbd7e8da3961b368e264d273e8132724136 100644
--- a/test_tipc/benchmark_train.sh
+++ b/test_tipc/benchmark_train.sh
@@ -137,7 +137,6 @@ else
     batch_size=${params_list[1]}
     batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
     precision=${params_list[2]}
-    # run_process_type=${params_list[3]}
     run_mode=${params_list[3]}
     device_num=${params_list[4]}
     IFS=";"
@@ -162,10 +161,9 @@ for batch_size in ${batch_size_list[*]}; do
             gpu_id=$(set_gpu_id $device_num)

             if [ ${#gpu_id} -le 1 ];then
-                run_process_type="SingleP"
                 log_path="$SAVE_LOG/profiling_log"
                 mkdir -p $log_path
-                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling"
+                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling"
                 func_sed_params "$FILENAME" "${line_gpuid}" "0"  # sed used gpu_id
                 # set profile_option params
                 tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
@@ -181,8 +179,8 @@ for batch_size in ${batch_size_list[*]}; do
                 speed_log_path="$SAVE_LOG/index"
                 mkdir -p $log_path
                 mkdir -p $speed_log_path
-                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
-                speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
+                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+                speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
                 func_sed_params "$FILENAME" "${line_profile}" "null"  # sed profile_id as null
                 cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
                 echo $cmd
@@ -193,13 +191,12 @@ for batch_size in ${batch_size_list[*]}; do
                 eval "cat ${log_path}/${log_name}"

                 # parser log
-                _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+                _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
                 cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                         --speed_log_file '${speed_log_path}/${speed_log_name}' \
                         --model_name ${_model_name} \
                         --base_batch_size ${batch_size} \
                         --run_mode ${run_mode} \
-                        --run_process_type ${run_process_type} \
                         --fp_item ${precision} \
                         --keyword ips: \
                         --skip_steps 2 \
@@ -213,13 +210,12 @@ for batch_size in ${batch_size_list[*]}; do
             else
                 IFS=";"
                 unset_env=`unset CUDA_VISIBLE_DEVICES`
-                run_process_type="MultiP"
                 log_path="$SAVE_LOG/train_log"
                 speed_log_path="$SAVE_LOG/index"
                 mkdir -p $log_path
                 mkdir -p $speed_log_path
-                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
-                speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
+                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+                speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
                 func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id"  # sed used gpu_id
                 func_sed_params "$FILENAME" "${line_profile}" "null"  # sed --profile_option as null
                 cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
@@ -230,14 +226,13 @@ for batch_size in ${batch_size_list[*]}; do
                 export model_run_time=$((${job_et}-${job_bt}))
                 eval "cat ${log_path}/${log_name}"

                 # parser log
-                _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+                _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
                 cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                         --speed_log_file '${speed_log_path}/${speed_log_name}' \
                         --model_name ${_model_name} \
                         --base_batch_size ${batch_size} \
                         --run_mode ${run_mode} \
-                        --run_process_type ${run_process_type} \
                         --fp_item ${precision} \
                         --keyword ips: \
                         --skip_steps 2 \
diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh
similarity index 58%
rename from test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh
rename to test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh
index 66cd2adbc8e404ad364a58dc215e33e4a5fb9302..65878fd7010418c4f9ff5c3b60d83f60d6c10184 100644
--- a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh
+++ b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh
@@ -1,7 +1,6 @@
 model_item=mask_rcnn_r50_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh
similarity index 63%
rename from test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh
rename to test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh
index 1dabcecc8c09e9b61d9f56e4040c0c5487e597f6..4109e021e991e6bc14f866b5d7ba104775ff8c54 100644
--- a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh
+++ b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh
@@ -1,7 +1,6 @@
 model_item=mask_rcnn_r50_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
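With run_process_type removed, run_benchmark.sh takes seven positional arguments: model_item, base_batch_size, fp_item, run_mode, device_num, max_iter, num_workers. As a rough illustration (assuming the PaddleDetection repo root as working directory and data already fetched by prepare.sh), the N1C1 launcher above expands to:

    bash test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh \
        mask_rcnn_r50_1x_coco 2 fp32 DP N1C1 100 2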
diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh
index 57e685d5390cf7d3f87deb887b01f443c50eade1..2bfed8bcefbc374c3e11cf1f874816ca31d800fe 100644
--- a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh
+++ b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh
@@ -1,23 +1,22 @@
 #!/usr/bin/env bash
 # Test training benchmark for a model.
-# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
 function _set_params(){
     model_item=${1:-"model_item"}    # (必选) 模型 item
     base_batch_size=${2:-"2"}        # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
     fp_item=${3:-"fp32"}             # (必选) fp32|fp16
-    run_process_type=${4:-"SingleP"} # (必选) 单进程 SingleP|多进程 MultiP
-    run_mode=${5:-"DP"}              # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
-    device_num=${6:-"N1C1"}          # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
+    run_mode=${4:-"DP"}              # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
+    device_num=${5:-"N1C1"}          # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
     profiling=${PROFILING:-"false"}  # (必选) Profiling 开关,默认关闭,通过全局变量传递
     model_repo="PaddleDetection"     # (必选) 模型套件的名字
     speed_unit="samples/sec"         # (必选)速度指标单位
     skip_steps=10                    # (必选)解析日志,跳过模型前几个性能不稳定的step
     keyword="ips:"                   # (必选)解析日志,筛选出性能数据所在行的关键字
     convergence_key="loss:"          # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
-    max_iter=${7:-"100"}             # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
-    num_workers=${8:-"8"}            # (可选)
+    max_iter=${6:-"100"}             # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
+    num_workers=${7:-"8"}            # (可选)
     # 以下为通用执行命令,无特殊可不用修改
-    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
     speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
 }
 function _train(){
+    export FLAGS_eager_delete_tensor_gb=0.0
+    export FLAGS_fraction_of_gpu_memory_to_use=0.98
+    export FLAGS_memory_fraction_of_eager_deletion=1.0
     cd ./static
     batch_size=${base_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs
     echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
                   --is_profiler=${is_profiler} "
     # 以下为通用执行命令,无特殊可不用修改
     case ${run_mode} in
-    DP) if [[ ${run_process_type} = "SingleP" ]];then
-            echo "run ${run_mode} ${run_process_type}"
+    DP) if [[ ${device_num} = "N1C1" ]];then
+            echo "run ${run_mode} ${device_num}"
             train_cmd="python -u tools/train.py ${train_cmd}"
-        elif [[ ${run_process_type} = "MultiP" ]];then
+        else
             rm -rf ./mylog
             train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
                 tools/train.py ${train_cmd}"
-        else
-            echo "run ${run_mode} ${run_process_type} error", exit 1
         fi
         ;;
     DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
     *) echo "choose run_mode "; exit 1;
     esac

+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
     timeout 15m ${train_cmd} > ${log_file} 2>&1
     if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
     else
         echo -e "${model_name}, SUCCESS"
     fi
-    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
-    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+    # kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
         rm ${log_file}
         cp mylog/workerlog.0 ${log_file}
     fi
diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh
similarity index 59%
rename from test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh
rename to test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh
index 266a5a7560787761398856eca6063a26b5bfacb3..a862a2279e4c08c7e6065276d040b88438b2fd2c 100644
--- a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh
+++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh
@@ -1,7 +1,6 @@
 model_item=mask_rcnn_r50_fpn_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh
similarity index 64%
rename from test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh
rename to test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh
index b8000b8eb1ce6e629bb7e6250a3f4899cf04cc5e..a9a41f2aed2182a59b8f365bbadc63994c53f27d 100644
--- a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh
+++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh
@@ -1,7 +1,6 @@
 model_item=mask_rcnn_r50_fpn_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh
index d3a5bbb9984a697cc6d164dcf080d861b1b7d8a0..740464a624db1511a52ecd24a5b7178d2287b1a6 100644
--- a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh
+++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh
@@ -1,23 +1,22 @@
 #!/usr/bin/env bash
 # Test training benchmark for a model.
-# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
 function _set_params(){
     model_item=${1:-"model_item"}    # (必选) 模型 item
     base_batch_size=${2:-"2"}        # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
     fp_item=${3:-"fp32"}             # (必选) fp32|fp16
-    run_process_type=${4:-"SingleP"} # (必选) 单进程 SingleP|多进程 MultiP
-    run_mode=${5:-"DP"}              # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
-    device_num=${6:-"N1C1"}          # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
+    run_mode=${4:-"DP"}              # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
+    device_num=${5:-"N1C1"}          # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
     profiling=${PROFILING:-"false"}  # (必选) Profiling 开关,默认关闭,通过全局变量传递
     model_repo="PaddleDetection"     # (必选) 模型套件的名字
     speed_unit="samples/sec"         # (必选)速度指标单位
     skip_steps=10                    # (必选)解析日志,跳过模型前几个性能不稳定的step
     keyword="ips:"                   # (必选)解析日志,筛选出性能数据所在行的关键字
     convergence_key="loss:"          # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
-    max_iter=${7:-"100"}             # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
-    num_workers=${8:-"8"}            # (可选)
+    max_iter=${6:-"100"}             # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
+    num_workers=${7:-"8"}            # (可选)
     # 以下为通用执行命令,无特殊可不用修改
-    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
     speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
 }
 function _train(){
+    export FLAGS_eager_delete_tensor_gb=0.0
+    export FLAGS_fraction_of_gpu_memory_to_use=0.98
+    export FLAGS_memory_fraction_of_eager_deletion=1.0
     cd ./static
     batch_size=${base_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs
     echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
                   --is_profiler=${is_profiler} "
     # 以下为通用执行命令,无特殊可不用修改
    case ${run_mode} in
-    DP) if [[ ${run_process_type} = "SingleP" ]];then
-            echo "run ${run_mode} ${run_process_type}"
+    DP) if [[ ${device_num} = "N1C1" ]];then
+            echo "run ${run_mode} ${device_num}"
             train_cmd="python -u tools/train.py ${train_cmd}"
-        elif [[ ${run_process_type} = "MultiP" ]];then
+        else
             rm -rf ./mylog
             train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
                 tools/train.py ${train_cmd}"
-        else
-            echo "run ${run_mode} ${run_process_type} error", exit 1
         fi
         ;;
     DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
     *) echo "choose run_mode "; exit 1;
     esac

+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
     timeout 15m ${train_cmd} > ${log_file} 2>&1
     if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
     else
         echo -e "${model_name}, SUCCESS"
     fi
-    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
-    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+    # kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
         rm ${log_file}
         cp mylog/workerlog.0 ${log_file}
     fi
diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh b/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh
similarity index 59%
rename from test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
rename to test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh
index d30a34bcbb686554dcafe470a0ef1cda163cafeb..3d3db1be54e782d3a2a7b9d3920d51299430f1ef 100644
--- a/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh
+++ b/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh
@@ -1,7 +1,6 @@
 model_item=yolov3_darknet53_270e_coco
 bs_item=8
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=8
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh b/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh
similarity index 64%
rename from test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
rename to test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh
index 8081990254812ffff24296b1fa97faa3df638727..5cc30f26745c7058319275c2d2dd56a0274a91d6 100644
--- a/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh
+++ b/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh
@@ -1,7 +1,6 @@
 model_item=yolov3_darknet53_270e_coco
 bs_item=8
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=8
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh
index 8a75a75b8033221a19cb2d816bcc8d317be57d27..3af4d15712454dad44bdffa0524a641893e9f12d 100644
--- a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh
+++ b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh
@@ -1,23 +1,22 @@
 #!/usr/bin/env bash
 # Test training benchmark for a model.
-# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
 function _set_params(){
     model_item=${1:-"model_item"}    # (必选) 模型 item
     base_batch_size=${2:-"2"}        # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
     fp_item=${3:-"fp32"}             # (必选) fp32|fp16
-    run_process_type=${4:-"SingleP"} # (必选) 单进程 SingleP|多进程 MultiP
-    run_mode=${5:-"DP"}              # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
-    device_num=${6:-"N1C1"}          # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
+    run_mode=${4:-"DP"}              # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
+    device_num=${5:-"N1C1"}          # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
     profiling=${PROFILING:-"false"}  # (必选) Profiling 开关,默认关闭,通过全局变量传递
     model_repo="PaddleDetection"     # (必选) 模型套件的名字
     speed_unit="samples/sec"         # (必选)速度指标单位
     skip_steps=10                    # (必选)解析日志,跳过模型前几个性能不稳定的step
     keyword="ips:"                   # (必选)解析日志,筛选出性能数据所在行的关键字
     convergence_key="loss:"          # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
-    max_iter=${7:-"100"}             # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
-    num_workers=${8:-"8"}            # (可选)
+    max_iter=${6:-"100"}             # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
+    num_workers=${7:-"8"}            # (可选)
     # 以下为通用执行命令,无特殊可不用修改
-    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
     speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
 }
 function _train(){
+    export FLAGS_eager_delete_tensor_gb=0.0
+    export FLAGS_fraction_of_gpu_memory_to_use=0.98
+    export FLAGS_memory_fraction_of_eager_deletion=1.0
     cd ./static
     batch_size=${base_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs
     echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
                   --is_profiler=${is_profiler} "
     # 以下为通用执行命令,无特殊可不用修改
     case ${run_mode} in
-    DP) if [[ ${run_process_type} = "SingleP" ]];then
-            echo "run ${run_mode} ${run_process_type}"
+    DP) if [[ ${device_num} = "N1C1" ]];then
+            echo "run ${run_mode} ${device_num}"
             train_cmd="python -u tools/train.py ${train_cmd}"
-        elif [[ ${run_process_type} = "MultiP" ]];then
+        else
            rm -rf ./mylog
             train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
                 tools/train.py ${train_cmd}"
-        else
-            echo "run ${run_mode} ${run_process_type} error", exit 1
         fi
         ;;
     DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
     *) echo "choose run_mode "; exit 1;
     esac

+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
     timeout 15m ${train_cmd} > ${log_file} 2>&1
     if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
     else
         echo -e "${model_name}, SUCCESS"
     fi
-    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
-    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+    # kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
         rm ${log_file}
         cp mylog/workerlog.0 ${log_file}
     fi
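Since the SingleP/MultiP distinction is gone, _train now picks the launch command from device_num alone: N1C1 runs a plain python -u tools/train.py, any other value goes through paddle.distributed.launch, and workerlog.0 from ./mylog replaces the run log. A rough sketch of the yolov3 N1C8 case follows; the CUDA_VISIBLE_DEVICES value is an assumption, since the benchmark environment normally exports it:

    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7   # assumed 8-GPU host
    bash test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh \
        yolov3_darknet53_270e_coco 8 fp32 DP N1C8 100 8
    # inside _train this resolves to:
    #   python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ...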