diff --git a/benchmark/readme.md b/benchmark/readme.md index 7f7704cca5341d495dfbcdc66ddfd29fbea1e1df..d90d21468e7c9d0c9068a273ae704c0c8a086eab 100644 --- a/benchmark/readme.md +++ b/benchmark/readme.md @@ -1,5 +1,5 @@ -# PaddleOCR DB/EAST 算法训练benchmark测试 +# PaddleOCR DB/EAST/PSE 算法训练benchmark测试 PaddleOCR/benchmark目录下的文件用于获取并分析训练日志。 训练采用icdar2015数据集,包括1000张训练图像和500张测试图像。模型配置采用resnet18_vd作为backbone,分别训练batch_size=8和batch_size=16的情况。 @@ -18,7 +18,7 @@ run_det.sh 执行方式如下: ``` # cd PaddleOCR/ -bash benchmark/run_det.sh +bash benchmark/run_det.sh ``` 以DB为例,将得到四个日志文件,如下: @@ -28,7 +28,3 @@ det_res18_db_v2.0_sp_bs8_fp32_1 det_res18_db_v2.0_mp_bs16_fp32_1 det_res18_db_v2.0_mp_bs8_fp32_1 ``` - - - - diff --git a/benchmark/run_benchmark_det.sh b/benchmark/run_benchmark_det.sh index 26bcda5d20ba4e4d0498da28aafb93f29468169d..46144b43a09baf787216728eabaab5f8548fa924 100644 --- a/benchmark/run_benchmark_det.sh +++ b/benchmark/run_benchmark_det.sh @@ -6,7 +6,7 @@ function _set_params(){ run_mode=${1:-"sp"} # 单卡sp|多卡mp batch_size=${2:-"64"} fp_item=${3:-"fp32"} # fp32|fp16 - max_iter=${4:-"500"} # 可选,如果需要修改代码提前中断 + max_iter=${4:-"10"} # 可选,如果需要修改代码提前中断 model_name=${5:-"model_name"} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 @@ -20,7 +20,7 @@ function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" - train_cmd="-c configs/det/${model_name}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_iter} " + train_cmd="-c configs/det/${model_name}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_iter} Global.eval_batch_step=[0,20000] Global.print_batch_step=2" case ${run_mode} in sp) train_cmd="python3.7 tools/train.py "${train_cmd}"" @@ -39,18 +39,24 @@ function _train(){ echo -e "${model_name}, SUCCESS" export job_fail_flag=0 fi - kill -9 `ps -ef|grep 'python3.7'|awk '{print $2}'` if [ $run_mode = "mp" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.0 ${log_file} fi +} - # run log analysis - analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file} --mission_name ${model_name} --run_mode ${mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_szie} --skip_steps 1 --gpu_num ${num_gpu_devices} --index 1 --model_mode=-1 --ips_unit=samples/sec" +function _analysis_log(){ + analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file} --mission_name ${model_name} --run_mode ${run_mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_size} --skip_steps 1 --gpu_num ${num_gpu_devices} --index 1 --model_mode=-1 --ips_unit=samples/sec" eval $analysis_cmd } +function _kill_process(){ + kill -9 `ps -ef|grep 'python3.7'|awk '{print $2}'` +} + + _set_params $@ _train - +_analysis_log +_kill_process \ No newline at end of file diff --git a/benchmark/run_det.sh b/benchmark/run_det.sh index c507510c615a60177e07300976947b010dbae990..68109b3ab2c3b8b61a0c90b4b31fd855c1ba2d46 100644 --- a/benchmark/run_det.sh +++ b/benchmark/run_det.sh @@ -3,11 +3,11 @@ # 1 安装该模型需要的依赖 (如需开启优化策略请注明) python3.7 -m pip install -r requirements.txt # 2 拷贝该模型需要数据、预训练模型 -wget -c -p ./tain_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../ -wget -c -p ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams +wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../ +wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams # 3 批量运行(如不方便批量,1,2需放到单个模型中) -model_mode_list=(det_res18_db_v2.0 det_r50_vd_east) +model_mode_list=(det_res18_db_v2.0 det_r50_vd_east det_r50_vd_pse) fp_item_list=(fp32) bs_list=(8 16) for model_mode in ${model_mode_list[@]}; do @@ -15,11 +15,11 @@ for model_mode in ${model_mode_list[@]}; do for bs_item in ${bs_list[@]}; do echo "index is speed, 1gpus, begin, ${model_name}" run_mode=sp - CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} # (5min) + CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} # (5min) sleep 60 echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" run_mode=mp - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} sleep 60 done done diff --git a/tools/program.py b/tools/program.py index 227b21f85023305d0e5318126cf45a26ecb973e9..d110f70704028948dff2bc889e07d128e0bc94ea 100755 --- a/tools/program.py +++ b/tools/program.py @@ -212,15 +212,15 @@ def train(config, for epoch in range(start_epoch, epoch_num + 1): train_dataloader = build_dataloader( config, 'Train', device, logger, seed=epoch) - train_batch_cost = 0.0 train_reader_cost = 0.0 - batch_sum = 0 - batch_start = time.time() + train_run_cost = 0.0 + total_samples = 0 + reader_start = time.time() max_iter = len(train_dataloader) - 1 if platform.system( ) == "Windows" else len(train_dataloader) for idx, batch in enumerate(train_dataloader): profiler.add_profiler_step(profiler_options) - train_reader_cost += time.time() - batch_start + train_reader_cost += time.time() - reader_start if idx >= max_iter: break lr = optimizer.get_lr() @@ -228,6 +228,7 @@ def train(config, if use_srn: model_average = True + train_start = time.time() # use amp if scaler: with paddle.amp.auto_cast(): @@ -252,8 +253,8 @@ def train(config, optimizer.step() optimizer.clear_grad() - train_batch_cost += time.time() - batch_start - batch_sum += len(images) + train_run_cost += time.time() - train_start + total_samples += len(images) if not isinstance(lr_scheduler, float): lr_scheduler.step() @@ -284,12 +285,13 @@ def train(config, logs = train_stats.log() strs = 'epoch: [{}/{}], iter: {}, {}, reader_cost: {:.5f} s, batch_cost: {:.5f} s, samples: {}, ips: {:.5f}'.format( epoch, epoch_num, global_step, logs, train_reader_cost / - print_batch_step, train_batch_cost / print_batch_step, - batch_sum, batch_sum / train_batch_cost) + print_batch_step, (train_reader_cost + train_run_cost) / + print_batch_step, total_samples, + total_samples / (train_reader_cost + train_run_cost)) logger.info(strs) - train_batch_cost = 0.0 train_reader_cost = 0.0 - batch_sum = 0 + train_run_cost = 0.0 + total_samples = 0 # eval if global_step > start_eval_step and \ (global_step - start_eval_step) % eval_batch_step == 0 and dist.get_rank() == 0: @@ -342,7 +344,7 @@ def train(config, global_step) global_step += 1 optimizer.clear_grad() - batch_start = time.time() + reader_start = time.time() if dist.get_rank() == 0: save_model( model, @@ -383,7 +385,11 @@ def eval(model, with paddle.no_grad(): total_frame = 0.0 total_time = 0.0 - pbar = tqdm(total=len(valid_dataloader), desc='eval model:') + pbar = tqdm( + total=len(valid_dataloader), + desc='eval model:', + position=0, + leave=True) max_iter = len(valid_dataloader) - 1 if platform.system( ) == "Windows" else len(valid_dataloader) for idx, batch in enumerate(valid_dataloader):