diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index b80e7257ce924b357650c64910a1e323389680d2..39d4c6d1adc3c4e1b43c7733c312f7b80b08d2f9 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -142,7 +142,6 @@ else batch_size=${params_list[1]} batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` precision=${params_list[2]} - # run_process_type=${params_list[3]} run_mode=${params_list[3]} device_num=${params_list[4]} IFS=";" @@ -167,10 +166,9 @@ for batch_size in ${batch_size_list[*]}; do gpu_id=$(set_gpu_id $device_num) if [ ${#gpu_id} -le 1 ];then - run_process_type="SingleP" log_path="$SAVE_LOG/profiling_log" mkdir -p $log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling" func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id # set profile_option params tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` @@ -186,8 +184,8 @@ for batch_size in ${batch_size_list[*]}; do speed_log_path="$SAVE_LOG/index" mkdir -p $log_path mkdir -p $speed_log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" - speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed" func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " echo $cmd @@ -198,13 +196,12 @@ for batch_size in ${batch_size_list[*]}; do eval "cat ${log_path}/${log_name}" # parser log - _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ --speed_log_file '${speed_log_path}/${speed_log_name}' \ --model_name ${_model_name} \ --base_batch_size ${batch_size} \ --run_mode ${run_mode} \ - --run_process_type ${run_process_type} \ --fp_item ${precision} \ --keyword ips: \ --skip_steps 2 \ @@ -218,13 +215,12 @@ for batch_size in ${batch_size_list[*]}; do else IFS=";" unset_env=`unset CUDA_VISIBLE_DEVICES` - run_process_type="MultiP" log_path="$SAVE_LOG/train_log" speed_log_path="$SAVE_LOG/index" mkdir -p $log_path mkdir -p $speed_log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" - speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed" func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " @@ -235,14 +231,13 @@ for batch_size in ${batch_size_list[*]}; do export model_run_time=$((${job_et}-${job_bt})) eval "cat ${log_path}/${log_name}" # parser log - _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ --speed_log_file '${speed_log_path}/${speed_log_name}' \ --model_name ${_model_name} \ --base_batch_size ${batch_size} \ --run_mode ${run_mode} \ - --run_process_type ${run_process_type} \ --fp_item ${precision} \ --keyword ips: \ --skip_steps 2 \ diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_DP.sh similarity index 62% rename from test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_SingleP_DP.sh rename to test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_DP.sh index b338608ba31f612bb661706a8ab9e17dcd51f5d6..2e74ab1b949eddecd86ded1ace0e761c73ce0ca8 100644 --- a/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_SingleP_DP.sh +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=256 fp_item=fp16 -run_process_type=SingleP run_mode=DP device_num=N1C1 max_epochs=1 @@ -10,4 +9,4 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_DP.sh similarity index 56% rename from test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_SingleP_DP.sh rename to test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_DP.sh index 6bb64ef7e52b5d6f6f6d36e212f4d002a6b12af3..0fc06ebaae4759d98cb1b8ada33ae8181dfee604 100644 --- a/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_SingleP_DP.sh +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=256 fp_item=fp32 -run_process_type=SingleP run_mode=DP device_num=N1C1 max_epochs=1 @@ -10,8 +9,8 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; # run profiling sleep 10; export PROFILING=true -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_DP.sh similarity index 62% rename from test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_SingleP_DP.sh rename to test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_DP.sh index c6d96b1f84b029fddd903c8ef6cbeaa22fb7a23d..270225ec85725d2c871e2c88f51f1f327716f1a1 100644 --- a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_SingleP_DP.sh +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=64 fp_item=fp16 -run_process_type=SingleP run_mode=DP device_num=N1C1 max_epochs=1 @@ -10,4 +9,4 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_DP.sh similarity index 56% rename from test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_SingleP_DP.sh rename to test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_DP.sh index 52ca473a46c3098ae879337c232cad61feb112ed..3db66a7e6f255c8977cb398b0750a7fd9486fc05 100644 --- a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_SingleP_DP.sh +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=64 fp_item=fp32 -run_process_type=SingleP run_mode=DP device_num=N1C1 max_epochs=1 @@ -10,8 +9,8 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; # run profiling sleep 10; export PROFILING=true -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_DP.sh similarity index 62% rename from test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_MultiP_DP.sh rename to test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_DP.sh index 492764df57eee5a6b1700679037e162a32d0235c..769ab2b1fed18a32744b41051a52aa176f50df1d 100644 --- a/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_MultiP_DP.sh +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=256 fp_item=fp16 -run_process_type=MultiP run_mode=DP device_num=N1C8 max_epochs=1 @@ -10,4 +9,4 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_DP.sh similarity index 62% rename from test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_MultiP_DP.sh rename to test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_DP.sh index f4988432d0784982bc4461e8db892e0988646c11..88ac5b85e6266875a8b94d910a1d8044de3cc55b 100644 --- a/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_MultiP_DP.sh +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=256 fp_item=fp32 -run_process_type=MultiP run_mode=DP device_num=N1C8 max_epochs=1 @@ -10,4 +9,4 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_DP.sh similarity index 62% rename from test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_MultiP_DP.sh rename to test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_DP.sh index e42ae50686f24c53482da79f3bb3e6f73c46f10e..61b768ea4702c143a7508a72844b740127a41bb6 100644 --- a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_MultiP_DP.sh +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=64 fp_item=fp16 -run_process_type=MultiP run_mode=DP device_num=N1C8 max_epochs=1 @@ -10,4 +9,4 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_DP.sh similarity index 62% rename from test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_MultiP_DP.sh rename to test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_DP.sh index e1e1d76e324b482d807dbe89a4e489d89f50a735..c20638ed22abd9867e55fdd8dbe42d474a27c8f6 100644 --- a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_MultiP_DP.sh +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_DP.sh @@ -1,7 +1,6 @@ model_item=ResNet50 bs_item=64 fp_item=fp32 -run_process_type=MultiP run_mode=DP device_num=N1C8 max_epochs=1 @@ -10,4 +9,4 @@ num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh # run -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh b/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh index 2c3e4e1e30c7770b75e0e6401c973ded04de227e..4d5b22181e693303a5a22a82a0ac6ab4253b5882 100644 --- a/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh +++ b/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh @@ -1,23 +1,22 @@ #!/usr/bin/env bash # Test training benchmark for a model. -# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} +# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item base_batch_size=${2:-"2"} # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数 fp_item=${3:-"fp32"} # (必选) fp32|fp16 - run_process_type=${4:-"SingleP"} # (必选) 单进程 SingleP|多进程 MultiP - run_mode=${5:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 - device_num=${6:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) + run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1 + device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleClas" # (必选) 模型套件的名字 speed_unit="samples/sec" # (必选)速度指标单位 skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" - max_epochs=${7:-"1"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 - num_workers=${8:-"4"} # (可选) + max_epochs=${6:-"1"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + num_workers=${7:-"4"} # (可选) # 以下为通用执行命令,无特殊可不用修改 - model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} @@ -48,13 +47,19 @@ function _train(){ train_cmd="${config_file} -o DataLoader.Train.sampler.batch_size=${base_batch_size} -o Global.epochs=${max_epochs} -o DataLoader.Train.loader.num_workers=${num_workers} ${profiling_config} -o Global.eval_during_train=False" # 以下为通用执行命令,无特殊可不用修改 - case ${run_process_type} in - SingleP) - train_cmd="python ppcls/static/train.py ${train_cmd}";; - MultiP) - train_cmd="python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 ppcls/static/train.py ${train_cmd}";; - *) echo "choose run_process_type(SingleP or MultiP)"; exit 1; + case ${run_mode} in + DP) if [[ ${device_num} = "N1C1" ]];then + echo "run ${run_mode} ${device_num}" + train_cmd="python ppcls/static/train.py ${train_cmd}" + else + rm -rf ./mylog + train_cmd="python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 ppcls/static/train.py ${train_cmd}" + fi + ;; + DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;; + *) echo "choose run_mode "; exit 1; esac + echo "train_cmd: ${train_cmd} log_file: ${log_file}" timeout 5m ${train_cmd} > ${log_file} 2>&1 if [ $? -ne 0 ];then @@ -63,7 +68,7 @@ function _train(){ echo -e "${model_name}, SUCCESS" fi # kill -9 `ps -ef|grep 'python'|awk '{print $2}'` - if [ ${run_process_type} = "MultiP" -a -d mylog ]; then + if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.0 ${log_file} fi