Merge branch 'dygraph' into PTDN_ppocrv2

a434b133 · Double_V · GitHub · ce92ff2e · bbf4625e · a434b133
8 changed file
--- a/benchmark/readme.md
+++ b/benchmark/readme.md
-# PaddleOCR DB/EAST 算法训练benchmark测试
+# PaddleOCR DB/EAST/PSE 算法训练benchmark测试
 PaddleOCR/benchmark目录下的文件用于获取并分析训练日志。
 训练采用icdar2015数据集，包括1000张训练图像和500张测试图像。模型配置采用resnet18_vd作为backbone，分别训练batch_size=8和batch_size=16的情况。
@@ -28,7 +28,3 @@ det_res18_db_v2.0_sp_bs8_fp32_1
 det_res18_db_v2.0_mp_bs16_fp32_1
 det_res18_db_v2.0_mp_bs8_fp32_1
 ```
--- a/benchmark/run_benchmark_det.sh
+++ b/benchmark/run_benchmark_det.sh
@@ -6,7 +6,7 @@ function _set_params(){
    run_mode=${1:-"sp"}          # 单卡sp|多卡mp
    batch_size=${2:-"64"}
    fp_item=${3:-"fp32"}        # fp32|fp16
-    max_iter=${4:-"500"}       # 可选，如果需要修改代码提前中断
+    max_iter=${4:-"10"}       # 可选，如果需要修改代码提前中断
    model_name=${5:-"model_name"}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR 后续QA设置该参数
@@ -20,7 +20,7 @@ function _train(){
    echo "Train on ${num_gpu_devices} GPUs"
    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
-    train_cmd="-c configs/det/${model_name}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_iter} "   
+    train_cmd="-c configs/det/${model_name}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_iter} Global.eval_batch_step=[0,20000] Global.print_batch_step=2"   
    case ${run_mode} in
      sp) 
        train_cmd="python3.7 tools/train.py "${train_cmd}""
@@ -39,18 +39,24 @@ function _train(){
        echo -e "${model_name}, SUCCESS"
        export job_fail_flag=0
    fi
-    kill -9 `ps -ef|grep 'python3.7'|awk '{print $2}'`
    if [ $run_mode = "mp" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.0 ${log_file}
    fi
+}
-    # run log analysis
+function _analysis_log(){
-    analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file}  --mission_name ${model_name} --run_mode ${mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_szie} --skip_steps 1 --gpu_num ${num_gpu_devices}  --index 1  --model_mode=-1  --ips_unit=samples/sec"
+    analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file}  --mission_name ${model_name} --run_mode ${run_mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_size} --skip_steps 1 --gpu_num ${num_gpu_devices}  --index 1  --model_mode=-1  --ips_unit=samples/sec"
    eval $analysis_cmd
 }
+function _kill_process(){
+    kill -9 `ps -ef|grep 'python3.7'|awk '{print $2}'`
+}
 _set_params $@
 _train
+_analysis_log
+_kill_process
\ No newline at end of file
--- a/benchmark/run_det.sh
+++ b/benchmark/run_det.sh
@@ -3,11 +3,11 @@
 # 1 安装该模型需要的依赖 (如需开启优化策略请注明)
 python3.7 -m pip install -r requirements.txt
 # 2 拷贝该模型需要数据、预训练模型
-wget -c  -p ./tain_data/  https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data  && tar xf icdar2015.tar && cd ../
+wget -P ./train_data/  https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data  && tar xf icdar2015.tar && cd ../
-wget -c -p ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams
+wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams
 # 3 批量运行（如不方便批量，1，2需放到单个模型中）
-model_mode_list=(det_res18_db_v2.0 det_r50_vd_east)
+model_mode_list=(det_res18_db_v2.0 det_r50_vd_east det_r50_vd_pse)
 fp_item_list=(fp32)
 bs_list=(8 16)
 for model_mode in ${model_mode_list[@]}; do
@@ -15,11 +15,11 @@ for model_mode in ${model_mode_list[@]}; do
          for bs_item in ${bs_list[@]}; do
            echo "index is speed, 1gpus, begin, ${model_name}"
            run_mode=sp
-            CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode}     #  (5min)
+            CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode}     #  (5min)
            sleep 60
            echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
            run_mode=mp
-            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} 
+            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} 
            sleep 60
            done
      done

--- a/test_tipc/docs/install.md
+++ b/test_tipc/docs/install.md
+## 1. 环境准备
-## 环境配置
 本教程适用于PTDN目录下基础功能测试的运行环境搭建。
 推荐环境：
- CUDA 10.1
+- CUDA 10.1/10.2
- CUDNN 7.6
+- CUDNN 7.6/cudnn8.1
- TensorRT 6.1.0.5 / 7.1
+- TensorRT 6.1.0.5 / 7.1 / 7.2
+环境配置可以选择docker镜像安装，或者在本地环境Python搭建环境。推荐使用docker镜像安装，避免不必要的环境配置。
+## 2. Docker 镜像安装
 推荐docker镜像安装，按照如下命令创建镜像，当前目录映射到镜像中的`/paddle`目录下
 ```
@@ -16,7 +18,79 @@ cd /paddle
 # 安装带TRT的paddle
 pip3.7 install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.3/linux-gpu-cuda10.1-cudnn7-mkl-gcc8.2-trt6-avx/paddlepaddle_gpu-2.1.3.post101-cp37-cp37m-linux_x86_64.whl
+```
+## 3 Python 环境构建
+非docker环境下，环境配置比较灵活，推荐环境组合配置：
+- CUDA10.1 + CUDNN7.6 + TensorRT 6
+- CUDA10.2 + CUDNN8.1 + TensorRT 7
+- CUDA11.1 + CUDNN8.1 + TensorRT 7
+下面以 CUDA10.2 + CUDNN8.1 + TensorRT 7 配置为例，介绍环境配置的流程。
+### 3.1 安装CUDNN
+如果当前环境满足CUDNN版本的要求，可以跳过此步骤。
+以CUDNN8.1 安装安装为例，安装步骤如下，首先下载CUDNN，从[Nvidia官网](https://developer.nvidia.com/rdp/cudnn-archive)下载CUDNN8.1版本，下载符合当前系统版本的三个deb文件，分别是：
+- cuDNN Runtime Library ，如：libcudnn8_8.1.0.77-1+cuda10.2_amd64.deb
+- cuDNN Developer Library ，如：libcudnn8-dev_8.1.0.77-1+cuda10.2_amd64.deb
+- cuDNN Code Samples，如：libcudnn8-samples_8.1.0.77-1+cuda10.2_amd64.deb
+deb安装可以参考[官方文档](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-deb)，安装方式如下
+```
+# x.x.x表示下载的版本号
+# $HOME为工作目录
+sudo dpkg -i libcudnn8_x.x.x-1+cudax.x_arm64.deb
+sudo dpkg -i libcudnn8-dev_8.x.x.x-1+cudax.x_arm64.deb
+sudo dpkg -i libcudnn8-samples_8.x.x.x-1+cudax.x_arm64.deb
+# 验证是否正确安装
+cp -r /usr/src/cudnn_samples_v8/ $HOME
+cd  $HOME/cudnn_samples_v8/mnistCUDNN
+# 编译
+make clean && make
+./mnistCUDNN
+```
+如果运行mnistCUDNN完后提示运行成功，则表示安装成功。如果运行后出现freeimage相关的报错，需要按照提示安装freeimage库:
+```
+sudo apt-get install libfreeimage-dev
+sudo apt-get install libfreeimage
+```
+### 3.2 安装TensorRT
+首先，从[Nvidia官网TensorRT板块](https://developer.nvidia.com/tensorrt-getting-started)下载TensorRT，这里选择7.1.3.4版本的TensorRT，注意选择适合自己系统版本和CUDA版本的TensorRT，另外建议下载TAR package的安装包。
+以Ubuntu16.04+CUDA10.2为例，下载并解压后可以参考[官方文档](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-713/install-guide/index.html#installing-tar)的安装步骤，按照如下步骤安装:
+```
+# 以下安装命令中 '${version}' 为下载的TensorRT版本，如7.1.3.4
+# 设置环境变量，<TensorRT-${version}/lib> 为解压后的TensorRT的lib目录
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<TensorRT-${version}/lib>
+# 安装TensorRT
+cd TensorRT-${version}/python
+pip3.7 install tensorrt-*-cp3x-none-linux_x86_64.whl
+# 安装graphsurgeon
+cd TensorRT-${version}/graphsurgeon
+```
+### 3.3 安装PaddlePaddle
+下载支持TensorRT版本的Paddle安装包，注意安装包的TensorRT版本需要与本地TensorRT一致，下载[链接](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html#python)
+选择下载 linux-cuda10.2-trt7-gcc8.2 Python3.7版本的Paddle：
+```
+# 从下载链接中可以看到是paddle2.1.1-cuda10.2-cudnn8.1版本
+wget  https://paddle-wheel.bj.bcebos.com/with-trt/2.1.1-gpu-cuda10.2-cudnn8.1-mkl-gcc8.2/paddlepaddle_gpu-2.1.1-cp37-cp37m-linux_x86_64.whl
+pip3.7 install -U paddlepaddle_gpu-2.1.1-cp37-cp37m-linux_x86_64.whl
+```
+## 4. 安装PaddleOCR依赖
+```
 # 安装AutoLog
 git clone https://github.com/LDOUBLEV/AutoLog
 cd AutoLog
@@ -24,7 +98,6 @@ pip3.7 install -r requirements.txt
 python3.7 setup.py bdist_wheel
 pip3.7 install ./dist/auto_log-1.0.0-py3-none-any.whl
 # 下载OCR代码
 cd ../
 git clone https://github.com/PaddlePaddle/PaddleOCR
@@ -45,4 +118,4 @@ A. 问题一般是当前安装paddle版本带TRT，但是本地环境找不到Te
 ```
 export LD_LIBRARY_PATH=/usr/local/python3.7.0/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/paddle/package/TensorRT-6.0.1.5/lib
 ```
-或者问题是下载的TensorRT版本和当前paddle中编译的TRT版本不匹配，需要下载版本相符的TRT。
+或者问题是下载的TensorRT版本和当前paddle中编译的TRT版本不匹配，需要下载版本相符的TensorRT重新安装。
--- a/test_tipc/docs/test_train_inference_python.md
+++ b/test_tipc/docs/test_train_inference_python.md
@@ -51,37 +51,37 @@
 `test_train_inference_python.sh`包含5种运行模式，每种模式的运行数据不同，分别用于测试速度和精度，分别是：
- 模式1：lite_train_infer，使用少量数据训练，用于快速验证训练到预测的走通流程，不验证精度和速度；
+- 模式1：lite_train_lite_infer，使用少量数据训练，用于快速验证训练到预测的走通流程，不验证精度和速度；
 ```shell
-bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'lite_train_infer'
+bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'lite_train_lite_infer'
-bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'lite_train_infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'lite_train_lite_infer'
 ```  
- 模式2：whole_infer，使用少量数据训练，一定量数据预测，用于验证训练后的模型执行预测，预测速度是否合理；
+- 模式2：lite_train_whole_infer，使用少量数据训练，一定量数据预测，用于验证训练后的模型执行预测，预测速度是否合理；
 ```shell
-bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_infer'
+bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'lite_train_whole_infer'
-bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'lite_train_whole_infer'
 ```  
- 模式3：infer，不训练，全量数据预测，走通开源模型评估、动转静，检查inference model预测时间和精度;
+- 模式3：whole_infer，不训练，全量数据预测，走通开源模型评估、动转静，检查inference model预测时间和精度;
 ```shell
-bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'infer'
+bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_infer'
 # 用法1:
-bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_infer'
 # 用法2: 指定GPU卡预测，第三个传入参数为GPU卡号
-bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'infer' '1'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_infer' '1'
 ```  
- 模式4：whole_train_infer，CE： 全量数据训练，全量数据预测，验证模型训练精度，预测精度，预测速度；
+- 模式4：whole_train_whole_infer，CE： 全量数据训练，全量数据预测，验证模型训练精度，预测精度，预测速度；
 ```shell
-bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_train_infer'
+bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_train_whole_infer'
-bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_train_infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'whole_train_whole_infer'
 ```  
- 模式5：klquant_infer，测试离线量化；
+- 模式5：klquant_whole_infer，测试离线量化；
 ```shell
-bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'klquant_infer'
+bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile_params.txt 'klquant_whole_infer'
-bash test_tipc/test_train_inference_python.sh test_tipc/configs/ppocr_det_mobile_params.txt  'klquant_infer'
+bash test_tipc/test_train_inference_python.sh test_tipc/configs/ppocr_det_mobile_params.txt  'klquant_whole_infer'
 ```

--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
 #!/bin/bash
 FILENAME=$1
-# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer',  'infer', 
+# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer',  
-#                 'cpp_infer', 'serving_infer', 'klquant_infer', 'lite_infer']
+#                 'whole_infer', 'klquant_whole_infer',
+#                 'cpp_infer', 'serving_infer',  'lite_infer']
 MODE=$2
@@ -34,7 +35,7 @@ trainer_list=$(func_parser_value "${lines[14]}")
 # MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer']
 MODE=$2
-if [ ${MODE} = "lite_train_infer" ];then
+if [ ${MODE} = "lite_train_lite_infer" ];then
    # pretrain lite train data
    wget -nc -P  ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
    wget -nc -P ./pretrain_models/  https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar
@@ -54,7 +55,7 @@ if [ ${MODE} = "lite_train_infer" ];then
    ln -s ./icdar2015_lite ./icdar2015
    cd ../
    cd ./inference && tar xf rec_inference.tar && cd ../
-elif [ ${MODE} = "whole_train_infer" ];then
+elif [ ${MODE} = "whole_train_whole_infer" ];then
    wget -nc -P  ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
    rm -rf ./train_data/icdar2015
    rm -rf ./train_data/ic15_data
@@ -65,7 +66,7 @@ elif [ ${MODE} = "whole_train_infer" ];then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar
        cd ./pretrain_models/ && tar xf ch_PP-OCRv2_det_distill_train.tar && cd ../
    fi
-elif [ ${MODE} = "whole_infer" ];then
+elif [ ${MODE} = "lite_train_whole_infer" ];then
    wget -nc -P  ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
    rm -rf ./train_data/icdar2015
    rm -rf ./train_data/ic15_data
@@ -78,7 +79,7 @@ elif [ ${MODE} = "whole_infer" ];then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar
        cd ./pretrain_models/ && tar xf ch_PP-OCRv2_det_distill_train.tar && cd ../
    fi
-elif [ ${MODE} = "infer" ];then
+elif [ ${MODE} = "whole_infer" ];then
    if [ ${model_name} = "ocr_det" ]; then
        eval_model_name="ch_ppocr_mobile_v2.0_det_train"
        rm -rf ./train_data/icdar2015
@@ -112,13 +113,16 @@ elif [ ${MODE} = "infer" ];then
        wget -nc  -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar
        cd ./inference && tar xf ${eval_model_name}.tar && tar xf rec_inference.tar && cd ../
    fi 
    elif [ ${model_name} = "PPOCRv2_ocr_det" ]; then
        eval_model_name="ch_PP-OCRv2_det_infer"
        wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar
        wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
        cd ./inference && tar xf ${eval_model_name}.tar && tar xf ch_det_data_50.tar && cd ../
    fi 
-elif [ ${MODE} = "klquant_infer" ];then
+elif [ ${MODE} = "klquant_whole_infer" ];then
    if [ ${model_name} = "ocr_det" ]; then
        wget -nc  -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar
        wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar

--- a/test_tipc/test_train_inference_python.sh
+++ b/test_tipc/test_train_inference_python.sh
@@ -2,7 +2,7 @@
 source tests/common_func.sh
 FILENAME=$1
-# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer', 'klquant_infer']
+# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer']
 MODE=$2
 dataline=$(awk 'NR==1, NR==51{print}'  $FILENAME)
@@ -89,7 +89,7 @@ infer_key1=$(func_parser_key "${lines[50]}")
 infer_value1=$(func_parser_value "${lines[50]}")
 # parser klquant_infer
-if [ ${MODE} = "klquant_infer" ]; then
+if [ ${MODE} = "klquant_whole_infer" ]; then
    dataline=$(awk 'NR==82, NR==98{print}'  $FILENAME)
    lines=(${dataline})
    # parser inference model 
@@ -203,7 +203,7 @@ function func_inference(){
    done
 }
-if [ ${MODE} = "infer" ] || [ ${MODE} = "klquant_infer" ]; then
+if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
    GPUID=$3
    if [ ${#GPUID} -le 0 ];then
        env=" "

--- a/tools/program.py
+++ b/tools/program.py
@@ -212,15 +212,15 @@ def train(config,
    for epoch in range(start_epoch, epoch_num + 1):
        train_dataloader = build_dataloader(
            config, 'Train', device, logger, seed=epoch)
-        train_batch_cost = 0.0
        train_reader_cost = 0.0
-        batch_sum = 0
+        train_run_cost = 0.0
-        batch_start = time.time()
+        total_samples = 0
+        reader_start = time.time()
        max_iter = len(train_dataloader) - 1 if platform.system(
        ) == "Windows" else len(train_dataloader)
        for idx, batch in enumerate(train_dataloader):
            profiler.add_profiler_step(profiler_options)
-            train_reader_cost += time.time() - batch_start
+            train_reader_cost += time.time() - reader_start
            if idx >= max_iter:
                break
            lr = optimizer.get_lr()
@@ -228,6 +228,7 @@ def train(config,
            if use_srn:
                model_average = True
+            train_start = time.time()
            # use amp
            if scaler:
                with paddle.amp.auto_cast():
@@ -252,8 +253,8 @@ def train(config,
                optimizer.step()
            optimizer.clear_grad()
-            train_batch_cost += time.time() - batch_start
+            train_run_cost += time.time() - train_start
-            batch_sum += len(images)
+            total_samples += len(images)
            if not isinstance(lr_scheduler, float):
                lr_scheduler.step()
@@ -284,12 +285,13 @@ def train(config,
                logs = train_stats.log()
                strs = 'epoch: [{}/{}], iter: {}, {}, reader_cost: {:.5f} s, batch_cost: {:.5f} s, samples: {}, ips: {:.5f}'.format(
                    epoch, epoch_num, global_step, logs, train_reader_cost /
-                    print_batch_step, train_batch_cost / print_batch_step,
+                    print_batch_step, (train_reader_cost + train_run_cost) /
-                    batch_sum, batch_sum / train_batch_cost)
+                    print_batch_step, total_samples,
+                    total_samples / (train_reader_cost + train_run_cost))
                logger.info(strs)
-                train_batch_cost = 0.0
                train_reader_cost = 0.0
-                batch_sum = 0
+                train_run_cost = 0.0
+                total_samples = 0
            # eval
            if global_step > start_eval_step and \
                    (global_step - start_eval_step) % eval_batch_step == 0 and dist.get_rank() == 0:
@@ -342,7 +344,7 @@ def train(config,
                                          global_step)
            global_step += 1
            optimizer.clear_grad()
-            batch_start = time.time()
+            reader_start = time.time()
        if dist.get_rank() == 0:
            save_model(
                model,
@@ -383,7 +385,11 @@ def eval(model,
    with paddle.no_grad():
        total_frame = 0.0
        total_time = 0.0
-        pbar = tqdm(total=len(valid_dataloader), desc='eval model:')
+        pbar = tqdm(
+            total=len(valid_dataloader),
+            desc='eval model:',
+            position=0,
+            leave=True)
        max_iter = len(valid_dataloader) - 1 if platform.system(
        ) == "Windows" else len(valid_dataloader)
        for idx, batch in enumerate(valid_dataloader):
@@ -452,8 +458,6 @@ def get_center(model, eval_dataloader, post_process_class):
        batch = [item.numpy() for item in batch]
        # Obtain usable results from post-processing methods
-        total_time += time.time() - start
-        # Evaluate the results of the current batch
        post_result = post_process_class(preds, batch[1])
        #update char_center