Merge pull request #1679 from RainFrost1/benchmark

add benchmark for tipc

Merge pull request #1679 from RainFrost1/benchmark
add benchmark for tipc
14d82854 · Walter · GitHub · 9e9a77f3 · 089382a4 · 14d82854
14 changed files
--- a/ppcls/engine/train/utils.py
+++ b/ppcls/engine/train/utils.py
@@ -48,7 +48,7 @@ def log_info(trainer, batch_size, epoch_id, iter_id):
        for key in trainer.time_info
    ])
-    ips_msg = "ips: {:.5f} images/sec".format(
+    ips_msg = "ips: {:.5f} samples/s".format(
        batch_size / trainer.time_info["batch_cost"].avg)
    eta_sec = ((trainer.config["Global"]["epochs"] - epoch_id + 1
                ) * len(trainer.train_dataloader) - iter_id

--- a/test_tipc/benchmark_train.sh
+++ b/test_tipc/benchmark_train.sh
+#!/bin/bash
+source test_tipc/common_func.sh
+# set env
+python=python
+export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3`
+export model_commit=$(git log|head -n1|awk '{print $2}') 
+export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
+export frame_version=${str_tmp%%.post*}
+export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`)
+# run benchmark sh 
+# Usage:
+# bash run_benchmark_train.sh config.txt params
+# or 
+# bash run_benchmark_train.sh config.txt
+# Print the value part of a "key=value" string, i.e. the text between the
+# first and the second '='. Prints an empty line when there is no '='.
+# Arguments:
+#   $1 - the "key=value" string
+function func_parser_params(){
+    local strs=$1
+    local -a array
+    # IFS is set only for this read, so the caller's IFS is left untouched
+    # and no pathname expansion is applied to the fields.
+    IFS="=" read -r -a array <<< "${strs}"
+    echo "${array[1]}"
+}
+function func_sed_params(){
+    filename=$1
+    line=$2
+    param_value=$3
+    params=`sed -n "${line}p" $filename`
+    IFS=":"
+    array=(${params})
+    key=${array[0]}
+    value=${array[1]}
+    new_params="${key}:${param_value}"
+    IFS=";"
+    cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'"
+    eval $cmd
+}
+function set_gpu_id(){
+    string=$1
+    _str=${string:1:6}
+    IFS="C"
+    arr=(${_str})
+    M=${arr[0]}
+    P=${arr[1]}
+    gn=`expr $P - 1`
+    gpu_num=`expr $gn / $M`
+    seq=`seq -s "," 0 $gpu_num`
+    echo $seq
+}
+# Print the last path component of the current working directory.
+# Prints an empty line when the working directory is "/".
+function get_repo_name(){
+    local cur_dir
+    cur_dir=$(pwd)
+    # Parameter expansion avoids splitting on IFS (which leaked to the caller)
+    # and the bad-subscript error of arr[-1] on an empty array.
+    echo "${cur_dir##*/}"
+}
+FILENAME=$1
+# copy FILENAME as new
+new_filename="./test_tipc/benchmark_train.txt"
+cmd=`yes|cp $FILENAME $new_filename`
+FILENAME=$new_filename
+# MODE must be one of ['benchmark_train']
+MODE=$2
+PARAMS=$3
+# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt  benchmark_train dynamic_bs8_null_DP_N1C1
+IFS=$'\n'
+# parser params from train_benchmark.txt
+sed -i 's/ -o DataLoader.Train.sampler.shuffle=False//g' $FILENAME
+sed -i 's/ -o DataLoader.Train.loader.num_workers=0//g' $FILENAME
+sed -i 's/-o DataLoader.Train.loader.use_shared_memory=False/-o Global.eval_during_train=False/g' $FILENAME
+dataline=`cat $FILENAME`
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+model_name=$(func_parser_value "${lines[1]}")
+# 获取benchmark_params所在的行数
+line_num=`grep -n "train_benchmark_params" $FILENAME  | cut -d ":" -f 1`
+# for train log parser
+batch_size=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+fp_items=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+epoch=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+profile_option_key=$(func_parser_key "${lines[line_num]}")
+profile_option_params=$(func_parser_value "${lines[line_num]}")
+profile_option="${profile_option_key}:${profile_option_params}"
+line_num=`expr $line_num + 1`
+flags_value=$(func_parser_value "${lines[line_num]}")
+# set flags
+IFS=";"
+flags_list=(${flags_value})
+for _flag in ${flags_list[*]}; do
+    cmd="export ${_flag}"
+    eval $cmd
+done
+# set log_name
+repo_name=$(get_repo_name )
+SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)}   # */benchmark_log
+mkdir -p "${SAVE_LOG}/benchmark_log/"
+status_log="${SAVE_LOG}/benchmark_log/results.log"
+# The number of lines in which train params can be replaced.
+line_python=3
+line_gpuid=4
+line_precision=6
+line_epoch=7
+line_batchsize=9
+line_profile=13
+line_eval_py=24
+line_export_py=30
+line_norm_train=16
+func_sed_params "$FILENAME" "${line_eval_py}" "null"
+func_sed_params "$FILENAME" "${line_export_py}" "null"
+func_sed_params "$FILENAME" "${line_python}"  "$python"
+# if params
+if [ -z "$PARAMS" ]; then
+    # No PARAMS given: run every '|'-separated batch size and precision from the config.
+    IFS="|" read -r -a batch_size_list <<< "${batch_size}"
+    IFS="|" read -r -a fp_items_list <<< "${fp_items}"
+    device_num_list=(N1C4)
+    run_mode="DP"
+else
+    # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
+    IFS="_" read -r -a params_list <<< "${PARAMS}"
+    if [ ${#params_list[@]} -lt 5 ]; then
+        echo "Invalid params '${PARAMS}', expected modeltype_bs\${bs_item}_\${fp_item}_\${run_mode}_\${device_num}" >&2
+        exit 1
+    fi
+    model_type=${params_list[0]}
+    # Keep only the digits of the batch-size field, e.g. bs8 -> 8.
+    batch_size=${params_list[1]//[!0-9]/}
+    precision=${params_list[2]}
+    # run_process_type=${params_list[3]}
+    run_mode=${params_list[3]}
+    device_num=${params_list[4]}
+    # "null" precision means the default, fp32.
+    if [ "${precision}" = "null" ]; then
+        precision="fp32"
+    fi
+    fp_items_list=("$precision")
+    batch_size_list=("$batch_size")
+    device_num_list=("$device_num")
+fi
+IFS="|"
+# Run every batch size / precision / device combination. The arrays are
+# already split into elements, so they are iterated quoted and the loop does
+# not depend on the current IFS.
+for batch_size in "${batch_size_list[@]}"; do
+    for precision in "${fp_items_list[@]}"; do
+        for device_num in "${device_num_list[@]}"; do
+            # sed batchsize and precision
+            func_sed_params "$FILENAME" "${line_precision}" "$precision"
+            func_sed_params "$FILENAME" "${line_batchsize}" "$batch_size"
+            func_sed_params "$FILENAME" "${line_epoch}" "$epoch"
+            gpu_id=$(set_gpu_id "$device_num")
+            # A one-character id list ("0") is a single-card run; longer lists are multi-card.
+            if [ ${#gpu_id} -le 1 ]; then
+                run_process_type="SingleP"
+                func_sed_params "$FILENAME" "${line_gpuid}" "0"  # sed used gpu_id
+            else
+                run_process_type="MultiP"
+                # Must run in this shell: inside a command substitution the unset is lost.
+                unset CUDA_VISIBLE_DEVICES
+                func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id"  # sed used gpu_id
+            fi
+            _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+            log_prefix="${repo_name}_${_model_name}_${device_num}"
+            if [ "${run_process_type}" = "SingleP" ]; then
+                # Profiling pass, single-card runs only.
+                log_path="$SAVE_LOG/profiling_log"
+                mkdir -p "$log_path"
+                log_name="${log_prefix}_profiling"
+                # set profile_option params (replaces the whole line, key included)
+                sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"
+                # run test_train_inference_python.sh
+                cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+                echo "$cmd"
+                eval "$cmd"
+                cat "${log_path}/${log_name}"
+            fi
+            # Timed run without profile
+            log_path="$SAVE_LOG/train_log"
+            speed_log_path="$SAVE_LOG/index"
+            mkdir -p "$log_path"
+            mkdir -p "$speed_log_path"
+            log_name="${log_prefix}_log"
+            speed_log_name="${log_prefix}_speed"
+            func_sed_params "$FILENAME" "${line_profile}" "null"  # sed profile_id as null
+            cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+            echo "$cmd"
+            # Epoch seconds: subtracting %Y%m%d%H%M%S stamps is wrong across minute boundaries.
+            job_bt=$(date '+%s')
+            eval "$cmd"
+            job_et=$(date '+%s')
+            export model_run_time=$((job_et - job_bt))
+            cat "${log_path}/${log_name}"
+            # parser log; speed_unit matches the "samples/s" printed by the trainer log
+            cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
+                    --speed_log_file '${speed_log_path}/${speed_log_name}' \
+                    --model_name ${_model_name} \
+                    --base_batch_size ${batch_size} \
+                    --run_mode ${run_mode} \
+                    --run_process_type ${run_process_type} \
+                    --fp_item ${precision} \
+                    --keyword ips: \
+                    --skip_steps 2 \
+                    --device_num ${device_num} \
+                    --speed_unit samples/s \
+                    --convergence_key loss: "
+            echo "$cmd"
+            eval "$cmd"
+            last_status=$?
+            status_check "$last_status" "${cmd}" "${status_log}"
+        done
+    done
+done
+cd train_log
+mkdir train_log
+mv Paddle* train_log/
+cd ..
+mv index train_log/
+mv profiling_log train_log/
--- a/test_tipc/config/HRNet/HRNet_W48_C_train_infer_python.txt
+++ b/test_tipc/config/HRNet/HRNet_W48_C_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:64|128
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/MobileNetV1/MobileNetV1_train_infer_python.txt
+++ b/test_tipc/config/MobileNetV1/MobileNetV1_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:64|128
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt
+++ b/test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:64|128
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/MobileNetV3/MobileNetV3_large_x1_0_train_infer_python.txt
+++ b/test_tipc/config/MobileNetV3/MobileNetV3_large_x1_0_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:256|640
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/ResNet/ResNet152_train_infer_python.txt
+++ b/test_tipc/config/ResNet/ResNet152_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:32
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/ResNet/ResNet50_train_infer_python.txt
+++ b/test_tipc/config/ResNet/ResNet50_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:128
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/ShuffleNet/ShuffleNetV2_x1_0_train_infer_python.txt
+++ b/test_tipc/config/ShuffleNet/ShuffleNetV2_x1_0_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:256|1536
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/SwinTransformer/SwinTransformer_tiny_patch4_window7_224_train_infer_python.txt
+++ b/test_tipc/config/SwinTransformer/SwinTransformer_tiny_patch4_window7_224_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:64|104
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/config/Twins/alt_gvt_base_train_infer_python.txt
+++ b/test_tipc/config/Twins/alt_gvt_base_train_infer_python.txt
@@ -50,3 +50,9 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml
 -o Global.benchmark:True
 null:null
 null:null
+===========================train_benchmark_params==========================
+batch_size:64|176
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
--- a/test_tipc/docs/benchmark_train.md
+++ b/test_tipc/docs/benchmark_train.md
+# TIPC Linux端Benchmark测试文档
+该文档为Benchmark测试说明，Benchmark训练性能测试的主程序为`benchmark_train.sh`，用于验证和监控模型训练的性能。
+# 1. 测试流程
+## 1.1 准备数据和环境安装
+运行`test_tipc/prepare.sh`，完成训练数据准备和安装环境流程。
+```shell
+# 运行格式：bash test_tipc/prepare.sh  train_benchmark.txt  mode
+bash test_tipc/prepare.sh test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt benchmark_train
+```
+## 1.2 功能测试
+执行`test_tipc/benchmark_train.sh`，完成模型训练和日志解析
+```shell
+# 运行格式：bash test_tipc/benchmark_train.sh train_benchmark.txt mode
+bash test_tipc/benchmark_train.sh test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt benchmark_train
+```
+`test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置，如下：
+```shell
+# 运行格式：bash test_tipc/benchmark_train.sh train_benchmark.txt mode params
+bash test_tipc/benchmark_train.sh test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt benchmark_train  dynamic_bs8_fp32_DP_N1C1
+```
+dynamic_bs8_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数，格式如下：
+`${modeltype}_bs${batch_size}_${fp_item}_${run_mode}_${device_num}`
+包含的信息有：模型类型、batchsize大小、训练精度如fp32,fp16等、分布式运行模式以及分布式训练使用的机器信息如单机单卡（N1C1）。
+# 2. 日志输出
+运行后将保存模型的训练日志和解析日志，使用 `test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt` 参数文件的训练日志解析结果是：
+```
+{"model_branch": "dygaph", "model_commit": "7c39a1996b19087737c05d883fd346d2f39dbcc0", "model_name": "cls_MobileNetV2_bs8_fp32_SingleP_DP", "batch_size": 8, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "5.413110", "convergence_key": "loss:", "ips": 19.333, "speed_unit": "samples/s", "device_num": "N1C1", "model_run_time": "0", "frame_commit": "8cc09552473b842c651ead3b9848d41827a3dbab", "frame_version": "0.0.0"}
+```
+训练日志和日志解析结果保存在train_log目录下，文件组织格式如下：
+```
+train_log/
+├── index
+│   ├── PaddleClas_cls_MobileNetV2_bs8_fp32_SingleP_DP_N1C1_speed
+│   └── PaddleClas_cls_MobileNetV2_bs8_fp32_MultiP_DP_N1C4_speed
+├── profiling_log
+│   └── PaddleClas_cls_MobileNetV2_bs8_fp32_SingleP_DP_N1C1_profiling
+└── train_log
+    ├── PaddleClas_cls_MobileNetV2_bs8_fp32_SingleP_DP_N1C1_log
+    └── PaddleClas_cls_MobileNetV2_bs8_fp32_MultiP_DP_N1C4_log
+```
--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@@ -182,3 +182,15 @@ if [ ${MODE} = "paddle2onnx_infer" ];then
    wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/ResNet50_vd_infer.tar  && tar xf ResNet50_vd_infer.tar
    cd ../../
 fi
+# benchmark_train: install requirements and prepare the ImageNet validation
+# archive under dataset/ILSVRC2012, with val_list.txt also exposed as train_list.txt.
+if [ "${MODE}" = "benchmark_train" ];then
+    pip install -r requirements.txt
+    # Stop here rather than delete/download in the wrong directory.
+    cd dataset || exit 1
+    rm -rf ILSVRC2012
+    wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/ImageNet1k/ILSVRC2012_val.tar
+    tar xf ILSVRC2012_val.tar
+    ln -sfn ILSVRC2012_val ILSVRC2012
+    cd ILSVRC2012 || exit 1
+    # -f: the link is still present in ILSVRC2012_val when this branch is re-run.
+    ln -sf val_list.txt  train_list.txt
+    cd ../../
+fi
--- a/test_tipc/test_train_inference_python.sh
+++ b/test_tipc/test_train_inference_python.sh
@@ -90,6 +90,10 @@ infer_value1=$(func_parser_value "${lines[50]}")
 if [ ! $epoch_num ]; then
  epoch_num=2
 fi
+# In benchmark_train mode the epoch count is forced to 1.
+# MODE is quoted so an empty value does not break the test expression.
+if [ "$MODE" = 'benchmark_train' ]; then
+  epoch_num=1
+fi
 LOG_PATH="./test_tipc/output"
 mkdir -p ${LOG_PATH}
 status_log="${LOG_PATH}/results_python.log"