diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..116ecacd36d211241ba8155586ee11d9bf052d8c
--- /dev/null
+++ b/test_tipc/benchmark_train.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+# set env
+python=python
+export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3`
+export model_commit=$(git log|head -n1|awk '{print $2}')
+export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
+export frame_version=${str_tmp%%.post*}
+export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`)
+
+# run benchmark script
+# Usage:
+# bash test_tipc/benchmark_train.sh config.txt mode params
+# or
+# bash test_tipc/benchmark_train.sh config.txt mode
+
+function func_parser_params(){
+    strs=$1
+    IFS="="
+    array=(${strs})
+    tmp=${array[1]}
+    echo ${tmp}
+}
+
+function func_sed_params(){
+    filename=$1
+    line=$2
+    param_value=$3
+    params=`sed -n "${line}p" $filename`
+    IFS=":"
+    array=(${params})
+    key=${array[0]}
+    value=${array[1]}
+    new_params="${key}:${param_value}"
+    IFS=";"
+    cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'"
+    eval $cmd
+}
+
+function set_gpu_id(){
+    string=$1
+    _str=${string:1:6}
+    IFS="C"
+    arr=(${_str})
+    M=${arr[0]}
+    P=${arr[1]}
+    gn=`expr $P - 1`
+    gpu_num=`expr $gn / $M`
+    seq=`seq -s "," 0 $gpu_num`
+    echo $seq
+}
+
+function get_repo_name(){
+    IFS=";"
+    cur_dir=$(pwd)
+    IFS="/"
+    arr=(${cur_dir})
+    echo ${arr[-1]}
+}
+
+FILENAME=$1
+# copy FILENAME to a new config file that this script can modify in place
+new_filename="./test_tipc/benchmark_train.txt"
+cmd=`yes|cp $FILENAME $new_filename`
+FILENAME=$new_filename
+# MODE must be one of ['benchmark_train']
+MODE=$2
+PARAMS=$3
+IFS=$'\n'
+# parse params from train_benchmark.txt
+dataline=`cat $FILENAME`
+# parse params
+IFS=$'\n'
+lines=(${dataline})
+model_name=$(func_parser_value "${lines[1]}")
+
+# get the line number of the train_benchmark_params section
+line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1`
+# for train log parser
+batch_size=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+fp_items=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+epoch=$(func_parser_value "${lines[line_num]}")
+
+line_num=`expr $line_num + 1`
+profile_option_key=$(func_parser_key "${lines[line_num]}")
+profile_option_params=$(func_parser_value "${lines[line_num]}")
+profile_option="${profile_option_key}:${profile_option_params}"
+
+line_num=`expr $line_num + 1`
+flags_value=$(func_parser_value "${lines[line_num]}")
+# set flags
+IFS=";"
+flags_list=(${flags_value})
+for _flag in ${flags_list[*]}; do
+    cmd="export ${_flag}"
+    eval $cmd
+done
+
+# set log_name
+repo_name=$(get_repo_name )
+SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)}  # */benchmark_log
+mkdir -p "${SAVE_LOG}/benchmark_log/"
+status_log="${SAVE_LOG}/benchmark_log/results.log"
+
+# line numbers in the config file where train params can be replaced
+line_python=3
+line_gpuid=4
+line_precision=6
+line_epoch=7
+line_batchsize=9
+line_profile=13
+line_eval_py=24
+line_export_py=30
+
+func_sed_params "$FILENAME" "${line_eval_py}" "null"
+func_sed_params "$FILENAME" "${line_export_py}" "null"
+func_sed_params "$FILENAME" "${line_python}" "$python"
+
+# check whether PARAMS was passed in
+if [ ! -n "$PARAMS" ] ;then
+    # PARAMS was not provided: use the default benchmark settings.
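+    # Default sweep: every batch size and precision listed in the config,
+    # on a single machine with 4 GPUs (N1C4), data-parallel (DP) training.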
+    IFS="|"
+    batch_size_list=(${batch_size})
+    fp_items_list=(${fp_items})
+    device_num_list=(N1C4)
+    run_mode="DP"
+else
+    # parse params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
+    IFS="_"
+    params_list=(${PARAMS})
+    model_type=${params_list[0]}
+    batch_size=${params_list[1]}
+    batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
+    precision=${params_list[2]}
+    # run_process_type=${params_list[3]}
+    run_mode=${params_list[3]}
+    device_num=${params_list[4]}
+    IFS=";"
+
+    if [ ${precision} = "null" ];then
+        precision="fp32"
+    fi
+
+    fp_items_list=($precision)
+    batch_size_list=($batch_size)
+    device_num_list=($device_num)
+fi
+
+IFS="|"
+for batch_size in ${batch_size_list[*]}; do
+    for precision in ${fp_items_list[*]}; do
+        for device_num in ${device_num_list[*]}; do
+            # sed batch size and precision into the config
+            #func_sed_params "$FILENAME" "${line_precision}" "$precision"
+            func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size"
+            func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch"
+            gpu_id=$(set_gpu_id $device_num)
+
+            if [ ${#gpu_id} -le 1 ];then
+                run_process_type="SingleP"
+                log_path="$SAVE_LOG/profiling_log"
+                mkdir -p $log_path
+                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling"
+                func_sed_params "$FILENAME" "${line_gpuid}" "0"  # sed used gpu_id
+                # set profile_option params
+                tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
+
+                # run test_train_inference_python.sh
+                cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+                echo $cmd
+                eval $cmd
+                eval "cat ${log_path}/${log_name}"
+
+                # without profile
+                log_path="$SAVE_LOG/train_log"
+                speed_log_path="$SAVE_LOG/index"
+                mkdir -p $log_path
+                mkdir -p $speed_log_path
+                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
+                speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
+                func_sed_params "$FILENAME" "${line_profile}" "null"  # sed profile option as null
+                cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+                echo $cmd
+                job_bt=`date '+%Y%m%d%H%M%S'`
+                eval $cmd
+                job_et=`date '+%Y%m%d%H%M%S'`
+                export model_run_time=$((${job_et}-${job_bt}))
+                eval "cat ${log_path}/${log_name}"
+
+                # parse log
+                _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+                cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
+                        --speed_log_file '${speed_log_path}/${speed_log_name}' \
+                        --model_name ${_model_name} \
+                        --base_batch_size ${batch_size} \
+                        --run_mode ${run_mode} \
+                        --run_process_type ${run_process_type} \
+                        --fp_item ${precision} \
+                        --keyword ips: \
+                        --skip_steps 2 \
+                        --device_num ${device_num} \
+                        --speed_unit samples/s \
+                        --convergence_key loss: "
+                echo $cmd
+                eval $cmd
+                last_status=${PIPESTATUS[0]}
+                status_check $last_status "${cmd}" "${status_log}"
+            else
+                IFS=";"
+                unset_env=`unset CUDA_VISIBLE_DEVICES`
+                run_process_type="MultiP"
+                log_path="$SAVE_LOG/train_log"
+                speed_log_path="$SAVE_LOG/index"
+                mkdir -p $log_path
+                mkdir -p $speed_log_path
+                log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
+                speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
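+                # Multi-GPU (MultiP) run: write the full GPU id list into the
+                # config, disable profiling, then launch training and time it.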
+                func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id"  # sed used gpu_id
+                func_sed_params "$FILENAME" "${line_profile}" "null"  # sed --profile_option as null
+                cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+                echo $cmd
+                job_bt=`date '+%Y%m%d%H%M%S'`
+                eval $cmd
+                job_et=`date '+%Y%m%d%H%M%S'`
+                export model_run_time=$((${job_et}-${job_bt}))
+                eval "cat ${log_path}/${log_name}"
+                # parse log
+                _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+
+                cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
+                        --speed_log_file '${speed_log_path}/${speed_log_name}' \
+                        --model_name ${_model_name} \
+                        --base_batch_size ${batch_size} \
+                        --run_mode ${run_mode} \
+                        --run_process_type ${run_process_type} \
+                        --fp_item ${precision} \
+                        --keyword ips: \
+                        --skip_steps 2 \
+                        --device_num ${device_num} \
+                        --speed_unit images/s \
+                        --convergence_key loss: "
+                echo $cmd
+                eval $cmd
+                last_status=${PIPESTATUS[0]}
+                status_check $last_status "${cmd}" "${status_log}"
+            fi
+        done
+    done
+done
\ No newline at end of file
diff --git a/test_tipc/configs/basicvsr/train_infer_python.txt b/test_tipc/configs/basicvsr/train_infer_python.txt
index 6990b078c502549c902db10194ddf005b62b8eb5..2c3f6ca2c3262151dc8325b69354d8cdab8a21a1 100644
--- a/test_tipc/configs/basicvsr/train_infer_python.txt
+++ b/test_tipc/configs/basicvsr/train_infer_python.txt
@@ -48,4 +48,10 @@ null:null
 null:null
 null:null
 --benchmark:True
-null:null
\ No newline at end of file
+null:null
+===========================train_benchmark_params==========================
+batch_size:2|4
+fp_items:fp32
+total_iters:50
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
\ No newline at end of file
diff --git a/test_tipc/configs/cyclegan/train_infer_python.txt b/test_tipc/configs/cyclegan/train_infer_python.txt
index 6acd3c47ebc768b0c460c955497dd69df5531d16..116fa443b4854781a8aad7e6768530ca49e317f6 100644
--- a/test_tipc/configs/cyclegan/train_infer_python.txt
+++ b/test_tipc/configs/cyclegan/train_infer_python.txt
@@ -13,7 +13,7 @@ train_infer_img_dir:./data/horse2zebra/test
 null:null
 ##
 trainer:norm_train
-norm_train:tools/main.py -c configs/cyclegan_horse2zebra.yaml --seed 123 -o log_config.interval=10 snapshot_config.interval=1
+norm_train:tools/main.py -c configs/cyclegan_horse2zebra.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=1
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -48,4 +48,10 @@ null:null
 null:null
 null:null
 --benchmark:True
-null:null
\ No newline at end of file
+null:null
+===========================train_benchmark_params==========================
+batch_size:1
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
\ No newline at end of file
diff --git a/test_tipc/configs/fom/train_infer_python.txt b/test_tipc/configs/fom/train_infer_python.txt
index b9ff8d7923b1ecb8c2809a58c1a050aed33d124b..02fb2fd8332b2b05f8957cc13ab74a0196749daa 100644
--- a/test_tipc/configs/fom/train_infer_python.txt
+++ b/test_tipc/configs/fom/train_infer_python.txt
@@ -48,4 +48,10 @@ null:null
 null:null
 null:null
 --benchmark:True
-null:null
\ No newline at end of file
+null:null
+===========================train_benchmark_params==========================
+batch_size:16
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
\ No newline at end of file
diff --git a/test_tipc/configs/pix2pix/train_infer_python.txt b/test_tipc/configs/pix2pix/train_infer_python.txt
index e9e1e1cc0fe794bb3753b0fa19f91e6adfc92221..ba0558c164abb3c8e921147fc39b019d89c95ce6 100644
--- a/test_tipc/configs/pix2pix/train_infer_python.txt
+++ b/test_tipc/configs/pix2pix/train_infer_python.txt
@@ -13,7 +13,7 @@ train_infer_img_dir:./data/facades/test
 null:null
 ##
 trainer:norm_train
-norm_train:tools/main.py -c configs/pix2pix_facades.yaml --seed 123 -o dataset.train.num_workers=0 log_config.interval=5
+norm_train:tools/main.py -c configs/pix2pix_facades.yaml --seed 123 -o dataset.train.num_workers=0 log_config.interval=1
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -48,4 +48,10 @@ null:null
 null:null
 null:null
 --benchmark:True
-null:null
\ No newline at end of file
+null:null
+===========================train_benchmark_params==========================
+batch_size:1
+fp_items:fp32
+epoch:10
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
\ No newline at end of file
diff --git a/test_tipc/configs/stylegan2/train_infer_python.txt b/test_tipc/configs/stylegan2/train_infer_python.txt
index 57da04f86725e5882f56accb39a4f7ba227b3088..70592e22f1b27ad0254c6ee60c31277a58f2fc08 100644
--- a/test_tipc/configs/stylegan2/train_infer_python.txt
+++ b/test_tipc/configs/stylegan2/train_infer_python.txt
@@ -48,4 +48,10 @@ null:null
 null:null
 null:null
 --benchmark:True
-null:null
\ No newline at end of file
+null:null
+===========================train_benchmark_params==========================
+batch_size:8|16
+fp_items:fp32
+epoch:100
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
\ No newline at end of file
diff --git a/test_tipc/docs/benchmark_train.md b/test_tipc/docs/benchmark_train.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f33d3e988741d3b4595b58a2a137603e765c2ed
--- /dev/null
+++ b/test_tipc/docs/benchmark_train.md
@@ -0,0 +1,52 @@
+
+# TIPC Linux Benchmark Test Guide
+
+This document describes the TIPC benchmark test. The entry point of the training benchmark is `benchmark_train.sh`, which is used to measure and monitor model training performance.
+
+# 1. Test workflow
+## 1.1 Prepare data and install the environment
+Run `test_tipc/prepare.sh` to prepare the training data and set up the environment.
+
+```shell
+# Usage: bash test_tipc/prepare.sh config.txt mode
+bash test_tipc/prepare.sh test_tipc/configs/basicvsr/train_infer_python.txt benchmark_train
+```
+
+## 1.2 Functional test
+Run `test_tipc/benchmark_train.sh` to train the model and parse the training logs.
+
+```shell
+# Usage: bash test_tipc/benchmark_train.sh config.txt mode
+bash test_tipc/benchmark_train.sh test_tipc/configs/basicvsr/train_infer_python.txt benchmark_train
+```
+
+`test_tipc/benchmark_train.sh` also accepts a third argument to run a single training configuration only, for example:
+```shell
+# Usage: bash test_tipc/benchmark_train.sh config.txt mode params
+bash test_tipc/benchmark_train.sh test_tipc/configs/basicvsr/train_infer_python.txt benchmark_train dynamic_bs4_fp32_DP_N1C1
+```
+dynamic_bs4_fp32_DP_N1C1 is the third argument passed to test_tipc/benchmark_train.sh, with the format
+`${modeltype}_bs${batch_size}_${fp_item}_${run_mode}_${device_num}`
+It encodes the model type, the batch size, the training precision (fp32, fp16, etc.), the distributed run mode, and the machine configuration used for distributed training, e.g. single machine, single GPU (N1C1).
+
+
+# 2. Log output
+
+After a run, both the training logs and the parsed logs are saved. With the `test_tipc/configs/basicvsr/train_infer_python.txt` config file, the parsed training log looks like this:
+
+```
+{"model_branch": "dygaph", "model_commit": "7c39a1996b19087737c05d883fd346d2f39dbcc0", "model_name": "basicvsr_bs4_fp32_SingleP_DP", "batch_size": 4, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "5.413110", "convergence_key": "loss:", "ips": 19.333, "speed_unit": "samples/s", "device_num": "N1C1", "model_run_time": "0", "frame_commit": "8cc09552473b842c651ead3b9848d41827a3dbab", "frame_version": "0.0.0"}
+```
+
+The training logs and the parsed results are saved under the benchmark_log directory, organized as follows:
+```
+train_log/
+├── index
+│   ├── PaddleGAN_basicvsr_bs4_fp32_SingleP_DP_N1C1_speed
+│   └── PaddleGAN_basicvsr_bs4_fp32_SingleP_DP_N1C4_speed
+├── profiling_log
+│   └── PaddleGAN_basicvsr_bs4_fp32_SingleP_DP_N1C1_profiling
+└── train_log
+    ├── PaddleGAN_basicvsr_bs4_fp32_SingleP_DP_N1C1_log
+    └── PaddleGAN_basicvsr_bs4_fp32_MultiP_DP_N1C4_log
+```
diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh
index 4f0b8967ff09ec25a8114d837d37189585899906..15b4849812790158c6aabc32800b40c46001f74b 100644
--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@@ -31,9 +31,13 @@ model_name=$(func_parser_value "${lines[1]}")
 trainer_list=$(func_parser_value "${lines[14]}")
 
+if [ ${MODE} = "benchmark_train" ];then
+    pip install -v -e .
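+    # benchmark_train reuses the data/environment preparation of the
+    # lite_train_lite_infer branch below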
+    MODE="lite_train_lite_infer"
+fi
+
 # MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer',
-#                 'whole_infer']
-MODE=$2
+#                 'whole_infer']
 
 if [ ${MODE} = "lite_train_lite_infer" ];then
     if [ ${model_name} == "pix2pix" ]; then
diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh
index be1849555c1d36e5d8f3bf32e6f94ccbf89a9f48..63ec8e6351024d9609fc2f9627670cb2edc1060b 100644
--- a/test_tipc/test_train_inference_python.sh
+++ b/test_tipc/test_train_inference_python.sh
@@ -239,11 +239,11 @@ else
         fi
         set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
         if [ ${#gpu} -le 2 ];then  # train with cpu or single gpu
-            cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
+            cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_train_params1} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_amp_config} "
         elif [ ${#ips} -le 26 ];then  # train with multi-gpu
-            cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
+            cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_train_params1} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_amp_config}"
         else  # train with multi-machine
-            cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
+            cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_train_params1} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_amp_config}"
         fi
         # run train
         eval "unset CUDA_VISIBLE_DEVICES"