#!/bin/bash
# Benchmark driver: rewrites a working copy of the given config for each
# batch size / precision / device spec / model type combination, runs
# test_tipc/test_train_inference_python.sh on it and passes the resulting
# log to ${BENCHMARK_ROOT}/scripts/analysis.py.
#
# func_parser_key, func_parser_value and status_check are used below but are
# not defined in this file; presumably they come from common_func.sh.
source test_tipc/common_func.sh

# set env
python=python
# Second column of the paddlepaddle-gpu row(s) printed by `pip list`;
# the outer echo joins multiple rows with single spaces.
str_tmp=$(echo $(pip list | grep paddlepaddle-gpu | awk -F ' ' '{print $2}'))
export str_tmp
# Same value with everything from the first ".post" onwards removed.
export frame_version=${str_tmp%%.post*}
# Value of paddle.version.commit as printed by the interpreter above.
frame_commit=$(echo $(${python} -c "import paddle;print(paddle.version.commit)"))
export frame_commit

# run benchmark sh
# Usage:
# bash test_tipc/benchmark_train.sh config.txt benchmark_train params
# or
# bash test_tipc/benchmark_train.sh config.txt benchmark_train

#######################################
# Print the second "="-separated field of a string such as "key=value".
# Arguments:
#   $1 - input string
# Outputs:
#   The text between the first and the second "=" followed by a newline;
#   an empty line when the input contains no "=".
# Notes:
#   Uses only local variables and parameter expansion, so the caller's IFS
#   and globals are left untouched and the value is never glob-expanded.
#######################################
function func_parser_params(){
    local strs=$1
    local rest value
    if [[ "${strs}" != *=* ]]; then
        # No separator at all: there is no second field.
        printf '%s\n' ""
        return 0
    fi
    rest=${strs#*=}      # drop the first field and its "="
    value=${rest%%=*}    # keep everything up to the next "="
    printf '%s\n' "${value}"
}

#######################################
# Replace the value part of one "key:value" line of a file in place.
# The key is whatever precedes the first ":" on that line; the whole line is
# rewritten as "<key>:<param_value>".
# Arguments:
#   $1 - file to edit (modified with `sed -i`)
#   $2 - 1-based line number to rewrite
#   $3 - new value
# Returns:
#   Exit status of the final `sed -i`.
# Side effects:
#   Leaves IFS set to ";" on return, exactly as the previous implementation
#   did. The top-level script calls this function in the current shell and
#   its later unquoted expansions are evaluated under that IFS, so the
#   behaviour is kept on purpose.
#######################################
function func_sed_params(){
    local filename=$1
    local line=$2
    local param_value=$3
    local params key new_params escaped

    params=$(sed -n "${line}p" "${filename}")
    key=${params%%:*}
    new_params="${key}:${param_value}"

    # "\", "/" and "&" are special in the replacement half of s/.../.../;
    # escape them so values such as paths are written literally.
    escaped=$(printf '%s' "${new_params}" | sed -e 's|[\\/&]|\\&|g')

    IFS=";"
    sed -i "${line}s/.*/${escaped}/" "${filename}"
}

#######################################
# Turn a device spec of the form N<a>C<b> (e.g. N1C4) into a comma-separated
# id list "0,1,...,k" where k = (b - 1) / a using integer division.
# Only characters 1..6 of the spec (after the leading letter) are examined.
# Arguments:
#   $1 - device spec
# Outputs:
#   The id list on stdout.
# Returns:
#   1 (with a message on stderr) when <a> or <b> is not a decimal number or
#   <a> is zero; otherwise the exit status of `seq`.
# Notes:
#   All variables are local and IFS is changed only for the `read` call, so
#   calling the function directly does not disturb the caller.
#######################################
function set_gpu_id(){
    local string=$1
    local _str=${string:1:6}
    local n_part c_part _rest last_id

    # Split "<a>C<b>" on the letter C.
    IFS="C" read -r n_part c_part _rest <<< "${_str}"

    if [[ ! "${n_part}" =~ ^[0-9]+$ || ! "${c_part}" =~ ^[0-9]+$ ]] \
        || (( 10#${n_part} == 0 )); then
        echo "set_gpu_id: invalid device spec '${string}' (expected N<a>C<b>)" >&2
        return 1
    fi

    # 10# forces base 10 so values with leading zeros are not read as octal.
    last_id=$(( (10#${c_part} - 1) / 10#${n_part} ))
    seq -s "," 0 "${last_id}"
}

#######################################
# Print the last path component of the current working directory.
# Outputs:
#   The directory name followed by a newline (an empty line when the
#   working directory is "/").
# Notes:
#   Implemented with parameter expansion, so IFS is not modified and
#   directory names containing glob characters are printed verbatim.
#######################################
function get_repo_name(){
    local cur_dir
    cur_dir=$(pwd)
    printf '%s\n' "${cur_dir##*/}"
}

# Positional arguments:
#   $1 - config file; it is copied and only the copy is modified
#   $2 - MODE
#   $3 - PARAMS (optional, see the examples below)
#   $4 - model_type
FILENAME=$1
# copy FILENAME as new
new_filename="./test_tipc/benchmark_train.txt"
# Abort when the copy fails: otherwise a stale benchmark_train.txt left by an
# earlier run would silently be used instead of the requested config.
if ! yes | cp "${FILENAME}" "${new_filename}"; then
    echo "benchmark_train.sh: cannot copy '${FILENAME}' to '${new_filename}'" >&2
    exit 1
fi
FILENAME=$new_filename
# MODE must be one of ['benchmark_train']
MODE=$2
PARAMS=$3
model_type=$4
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt  benchmark_train
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt  benchmark_train dynamicTostatic
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt  benchmark_train dynamic_bs8_null_DP_N1C1
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt  benchmark_train dynamicTostatic_bs8_null_DP_N1C1

IFS=$'\n'
# parser params from train_benchmark.txt
sed -i 's/ -o DataLoader.Train.sampler.shuffle=False/ -o Global.print_batch_step=1/g' "$FILENAME"
sed -i 's/-o DataLoader.Train.loader.use_shared_memory=False/ -o Global.eval_during_train=False/g' "$FILENAME"
dataline=$(cat "$FILENAME")
# parser params
IFS=$'\n'
# Deliberately unquoted: the file is split on newlines into one element per
# line. Word splitting drops empty lines, so indices count non-empty lines.
lines=(${dataline})
model_name=$(func_parser_value "${lines[1]}")

# Get the line number of the "train_benchmark_params" marker.
line_num=$(grep -n "train_benchmark_params" "$FILENAME" | cut -d ":" -f 1)
if [[ ! "${line_num}" =~ ^[0-9]+$ ]]; then
    # Missing marker or several matches: the indices below would be garbage.
    echo "benchmark_train.sh: expected exactly one 'train_benchmark_params' line in ${FILENAME}" >&2
    exit 1
fi
# for train log parser
# grep numbers lines from 1 while the array is indexed from 0, so
# lines[line_num] is the entry that follows the marker line.
batch_size=$(func_parser_value "${lines[line_num]}")
line_num=$((line_num + 1))
fp_items=$(func_parser_value "${lines[line_num]}")
line_num=$((line_num + 1))
epoch=$(func_parser_value "${lines[line_num]}")
line_num=$((line_num + 1))
model_type=$(func_parser_value "${lines[line_num]}")
line_num=$((line_num + 1))
num_workers=$(func_parser_value "${lines[line_num]}")


line_num=$((line_num + 1))
profile_option_key=$(func_parser_key "${lines[line_num]}")
profile_option_params=$(func_parser_value "${lines[line_num]}")
profile_option="${profile_option_key}:${profile_option_params}"

line_num=$((line_num + 1))
flags_value=$(func_parser_value "${lines[line_num]}")
# set flags
# flags_value is a ";"-separated list; each entry is exported via eval.
IFS=";"
flags_list=(${flags_value})
for _flag in ${flags_list[*]}; do
    cmd="export ${_flag}"
    eval $cmd
done

# set log_name
repo_name=$(get_repo_name )
# Logs go below $BENCHMARK_LOG_DIR when it is set, else below the cwd.
SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)}   # */benchmark_log
mkdir -p "${SAVE_LOG}/benchmark_log/"
status_log="${SAVE_LOG}/benchmark_log/results.log"

# The number of lines in which train params can be replaced.
# (1-based line numbers inside $FILENAME, passed to func_sed_params / sed.)
line_python=3
line_gpuid=4
line_precision=6
line_epoch=7
line_batchsize=9
line_profile=13
line_model_type=15
line_norm_train=16
line_pact_train=17
line_fgpm_train=18
line_eval_py=24
line_export_py=30

# Set the eval, export, pact and fgpm entries to "null" and point the python
# entry at the interpreter chosen above.
func_sed_params "$FILENAME" "${line_eval_py}" "null"
func_sed_params "$FILENAME" "${line_export_py}" "null"
func_sed_params "$FILENAME" "${line_python}"  "$python"
func_sed_params "$FILENAME" "${line_pact_train}" "null"
func_sed_params "$FILENAME" "${line_fgpm_train}" "null"
# set num_workers
sed -i "s/ -o DataLoader.Train.loader.num_workers=0/ -o DataLoader.Train.loader.num_workers=${num_workers}/g" "$FILENAME"

# if params
#######################################
# Build the lists the run loop iterates over from PARAMS and the values
# parsed from the config.
# Globals read:    PARAMS, batch_size, fp_items, model_type
# Globals written: batch_size_list, fp_items_list, device_num_list,
#                  model_type_list, run_mode, IFS; additionally model_type,
#                  batch_size, precision, device_num and params_list
#                  depending on the branch taken.
# Variables are intentionally global: the rest of the script reads them.
#######################################
function func_resolve_run_params(){
    if [[ -z "$PARAMS" ]];then
        # PARAMS input is not a word.
        # Config values are "|"-separated lists.
        IFS="|"
        batch_size_list=(${batch_size})
        fp_items_list=(${fp_items})
        device_num_list=(N1C4)
        model_type_list=(${model_type})
        run_mode="DP"
    elif [[ ${PARAMS} = "dynamicTostatic" ]];then
        IFS="|"
        model_type=$PARAMS
        batch_size_list=(${batch_size})
        fp_items_list=(${fp_items})
        device_num_list=(N1C4)
        # Without this the model_type loop had nothing to iterate over and no
        # run was started; use the same mapping as the full-spec branch below.
        model_type_list=("to_static_train")
        run_mode="DP"
    else
        # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
        IFS="_"
        params_list=(${PARAMS})
        model_type=${params_list[0]}
        batch_size=${params_list[1]}
        # keep only the digits of e.g. "bs8"
        batch_size=${batch_size//[!0-9]/}
        precision=${params_list[2]}
        run_mode=${params_list[3]}
        device_num=${params_list[4]}
        IFS=";"

        if [[ ${precision} = "null" ]];then
            precision="fp32"
        fi

        fp_items_list=($precision)
        batch_size_list=($batch_size)
        device_num_list=($device_num)

        # parse "to_static" options and modify trainer into "to_static_trainer"
        if [[ ${model_type} = "dynamicTostatic" ]];then
            model_type_list=("to_static_train")
        else
            model_type_list=("norm_train")
        fi
    fi
}
func_resolve_run_params


# Run every batch size x precision x device spec x model type combination.
# func_sed_params leaves IFS set to ";" after its first call, so most of the
# unquoted expansions in the loop body are evaluated under that IFS.
IFS="|"
for batch_size in ${batch_size_list[*]}; do
    for precision in ${fp_items_list[*]}; do
        for device_num in ${device_num_list[*]}; do
            for model_type in ${model_type_list[*]}; do
                # sed batchsize and precision
                func_sed_params "$FILENAME" "${line_precision}" "$precision"
                func_sed_params "$FILENAME" "${line_batchsize}" "$batch_size"
                func_sed_params "$FILENAME" "${line_epoch}" "$epoch"
                func_sed_params "$FILENAME" "${line_model_type}" "$model_type"

                # for log name
                if [[ ${model_type} = "to_static_train" ]];then
                    to_static="d2sT_"
                else
                    to_static=""
                fi

                gpu_id=$(set_gpu_id $device_num)

                # if bs is big, then copy train_list.txt to generate more train log
                # At least 25 log number would be good to calculate ips for benchmark system.
                # So the copy number for train_list is as follows:
                # The arithmetic is evaluated inside a command substitution so
                # that an arithmetic error stays confined to that subshell.
                total_batch_size=$(echo $(( ${batch_size} * ${device_num:1:1} * ${device_num:3:3} )))
                if [[ $model_name == *GeneralRecognition* ]]; then
                    # Subshell: the directory change cannot leak, and a failed
                    # cd can no longer be followed by a "cd .." that leaves
                    # the repository root.
                    (
                        cd dataset/ || exit 1
                        train_list_length=$(cat train_reg_all_data.txt | wc -l)
                        copy_num=$(echo $(( 25 * 10 * ${total_batch_size} / ${train_list_length} )))
                        if [[ $copy_num -gt 1 ]];then
                            rm -rf train_reg_all_data.txt
                            for ((i=1; i <=$copy_num; i++));do
                                cat tipc_shitu_demo_data/demo_train.txt >> train_reg_all_data.txt
                            done
                        fi
                    )
                fi

                # A one-character gpu_id (e.g. "0") means a single device.
                if [[ ${#gpu_id} -le 1 ]];then
                    log_path="$SAVE_LOG/profiling_log"
                    mkdir -p $log_path
                    log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling"
                    func_sed_params "$FILENAME" "${line_gpuid}" "0"  # sed used gpu_id
                    # set profile_option params
                    sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"

                    # run test_train_inference_python.sh
                    cmd="timeout 5m bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
                    echo $cmd
                    eval ${cmd}
                    eval "cat ${log_path}/${log_name}"

                    # without profile
                    log_path="$SAVE_LOG/train_log"
                    speed_log_path="$SAVE_LOG/index"
                    mkdir -p $log_path
                    mkdir -p $speed_log_path
                    log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
                    speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
                    func_sed_params "$FILENAME" "${line_profile}" "null"  # sed profile_id as null
                    cmd="timeout 5m bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
                    echo $cmd
                    # Epoch seconds: subtracting YYYYmmddHHMMSS stamps does not
                    # give a duration once a minute/hour/day boundary is crossed.
                    job_bt=$(date '+%s')
                    eval ${cmd}
                    job_et=$(date '+%s')
                    export model_run_time=$((${job_et}-${job_bt}))
                    eval "cat ${log_path}/${log_name}"

                    # parser log
                    _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
                    # NOTE(review): this branch reports samples/s while the
                    # multi-device branch below reports images/s - confirm
                    # that the difference is intended.
                    cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                            --speed_log_file '${speed_log_path}/${speed_log_name}' \
                            --model_name ${_model_name} \
                            --base_batch_size ${batch_size} \
                            --run_mode ${run_mode} \
                            --fp_item ${precision} \
                            --keyword ips: \
                            --skip_steps 100 \
                            --device_num ${device_num} \
                            --speed_unit samples/s \
                            --convergence_key loss: "
                    echo $cmd
                    eval $cmd
                    last_status=${PIPESTATUS[0]}
                    status_check $last_status "${cmd}" "${status_log}" "${model_name}"
                else
                    IFS=";"
                    # NOTE(review): the unset runs inside a command
                    # substitution (a subshell), so CUDA_VISIBLE_DEVICES of
                    # this shell is NOT changed. Left as-is because making it
                    # effective would change which devices existing setups use.
                    unset_env=$(unset CUDA_VISIBLE_DEVICES)
                    log_path="$SAVE_LOG/train_log"
                    speed_log_path="$SAVE_LOG/index"
                    mkdir -p $log_path
                    mkdir -p $speed_log_path
                    log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
                    speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
                    func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id"  # sed used gpu_id
                    func_sed_params "$FILENAME" "${line_profile}" "null"  # sed --profile_option as null
                    if [[ ${device_num} = "N4C32" ]];then
                        duration=10m
                    else
                        duration=5m
                    fi
                    cmd="timeout ${duration} bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
                    echo $cmd
                    # Epoch seconds, see the single-device branch.
                    job_bt=$(date '+%s')
                    eval ${cmd}
                    job_et=$(date '+%s')
                    export model_run_time=$((${job_et}-${job_bt}))
                    eval "cat ${log_path}/${log_name}"
                    # parser log
                    _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"

                    cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                            --speed_log_file '${speed_log_path}/${speed_log_name}' \
                            --model_name ${_model_name} \
                            --base_batch_size ${batch_size} \
                            --run_mode ${run_mode} \
                            --fp_item ${precision} \
                            --keyword ips: \
                            --skip_steps 100 \
                            --device_num ${device_num} \
                            --speed_unit images/s \
                            --convergence_key loss: "
                    echo $cmd
                    eval $cmd
                    last_status=${PIPESTATUS[0]}
                    status_check $last_status "${cmd}" "${status_log}" "${model_name}"
                fi
            done
        done
    done
done