Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into feature_amp_train

025692aa · stephon · 82f19a31 · 3bc7670f · 025692aa · 025692aa
32 changed file
--- a/tests/common_func.sh
+++ b/tests/common_func.sh
--- a/tests/compare_results.py
+++ b/tests/compare_results.py
--- a/tests/configs/det_mv3_db.yml
+++ b/tests/configs/det_mv3_db.yml
--- a/tests/configs/det_r50_vd_db.yml
+++ b/tests/configs/det_r50_vd_db.yml
--- a/tests/configs/ppocr_det_mobile_params.txt
+++ b/tests/configs/ppocr_det_mobile_params.txt
@@ -65,6 +65,8 @@ inference:./deploy/cpp_infer/build/ppocr det
 null:null
 --benchmark:True
 ===========================serving_params===========================
+model_name:ocr_det
+python:python3.7
 trans_model:-m paddle_serving_client.convert
 --dirname:./inference/ch_ppocr_mobile_v2.0_det_infer/
 --model_filename:inference.pdmodel
@@ -82,14 +84,14 @@ pipline:pipeline_http_client.py --image_dir=../../doc/imgs
 ===========================kl_quant_params===========================
 infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
 infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
-infer_quant:False
+infer_quant:True
 inference:tools/infer/predict_det.py
 --use_gpu:True|False
 --enable_mkldnn:True|False
 --cpu_threads:1|6
 --rec_batch_num:1
 --use_tensorrt:False|True
--precision:fp32|fp16|int8
+--precision:int8
 --det_model_dir:
 --image_dir:./inference/ch_det_data_50/all-sum-510/
 null:null

--- a/tests/configs/ppocr_det_server_params.txt
+++ b/tests/configs/ppocr_det_server_params.txt
@@ -49,4 +49,35 @@ inference:tools/infer/predict_det.py
 --save_log_path:null
 --benchmark:True
 null:null
+===========================cpp_infer_params===========================
+use_opencv:True
+infer_model:./inference/ch_ppocr_server_v2.0_det_infer/
+infer_quant:False
+inference:./deploy/cpp_infer/build/ppocr det
+--use_gpu:True|False
+--enable_mkldnn:True|False
+--cpu_threads:1|6
+--rec_batch_num:1
+--use_tensorrt:False|True
+--precision:fp32|fp16
+--det_model_dir:
+--image_dir:./inference/ch_det_data_50/all-sum-510/
+null:null
+--benchmark:True
+===========================serving_params===========================
+model_name:ocr_det_server
+python:python3.7
+trans_model:-m paddle_serving_client.convert
+--dirname:./inference/ch_ppocr_server_v2.0_det_infer/
+--model_filename:inference.pdmodel
+--params_filename:inference.pdiparams
+--serving_server:./deploy/pdserving/ppocr_det_mobile_2.0_serving/
+--serving_client:./deploy/pdserving/ppocr_det_mobile_2.0_client/
+serving_dir:./deploy/pdserving
+web_service:web_service_det.py --config=config.yml --opt op.det.concurrency=1
+op.det.local_service_conf.devices:null|0
+op.det.local_service_conf.use_mkldnn:True|False
+op.det.local_service_conf.thread_num:1|6
+op.det.local_service_conf.use_trt:False|True
+op.det.local_service_conf.precision:fp32|fp16|int8
+pipline:pipeline_http_client.py --image_dir=../../doc/imgs
--- a/tests/configs/ppocr_rec_mobile_params.txt
+++ b/tests/configs/ppocr_rec_mobile_params.txt
@@ -65,6 +65,8 @@ inference:./deploy/cpp_infer/build/ppocr rec
 null:null
 --benchmark:True
 ===========================serving_params===========================
+model_name:ocr_rec
+python:python3.7
 trans_model:-m paddle_serving_client.convert
 --dirname:./inference/ch_ppocr_mobile_v2.0_rec_infer/
 --model_filename:inference.pdmodel

--- a/tests/configs/ppocr_rec_server_params.txt
+++ b/tests/configs/ppocr_rec_server_params.txt
@@ -65,12 +65,14 @@ inference:./deploy/cpp_infer/build/ppocr rec
 null:null
 --benchmark:True
 ===========================serving_params===========================
+model_name:ocr_server_rec
+python:python3.7
 trans_model:-m paddle_serving_client.convert
 --dirname:./inference/ch_ppocr_server_v2.0_rec_infer/
 --model_filename:inference.pdmodel
 --params_filename:inference.pdiparams
--serving_server:./deploy/pdserving/ppocr_rec_server_2.0_serving/
+--serving_server:./deploy/pdserving/ppocr_rec_mobile_2.0_serving/
--serving_client:./deploy/pdserving/ppocr_rec_server_2.0_client/
+--serving_client:./deploy/pdserving/ppocr_rec_mobile_2.0_client/
 serving_dir:./deploy/pdserving
 web_service:web_service_rec.py --config=config.yml --opt op.rec.concurrency=1
 op.rec.local_service_conf.devices:null|0

--- a/tests/configs/ppocr_sys_mobile_params.txt
+++ b/tests/configs/ppocr_sys_mobile_params.txt
--- a/tests/configs/ppocr_sys_server_params.txt
+++ b/tests/configs/ppocr_sys_server_params.txt
--- a/tests/configs/rec_icdar15_r34_train.yml
+++ b/tests/configs/rec_icdar15_r34_train.yml
--- a/PTDN/docs/compare_cpp_right.png
+++ b/PTDN/docs/compare_cpp_right.png
--- a/PTDN/docs/compare_cpp_wrong.png
+++ b/PTDN/docs/compare_cpp_wrong.png
--- a/tests/docs/compare_right.png
+++ b/tests/docs/compare_right.png
--- a/tests/docs/compare_wrong.png
+++ b/tests/docs/compare_wrong.png
--- a/tests/docs/guide.png
+++ b/tests/docs/guide.png
--- a/PTDN/docs/test.png
+++ b/PTDN/docs/test.png
--- a/tests/docs/test_cpp.md
+++ b/tests/docs/test_cpp.md
 # C++预测功能测试
-C++预测功能测试的主程序为`test_cpp.sh`，可以测试基于C++预测库的模型推理功能。
+C++预测功能测试的主程序为`test_inference_cpp.sh`，可以测试基于C++预测库的模型推理功能。
-## 测试结论汇总
+## 1. 测试结论汇总
-| 算法名称 | 模型名称 |device | batchsize | mkldnn | cpu多线程 | tensorrt | 离线量化 |
+基于训练是否使用量化，进行本测试的模型可以分为`正常模型`和`量化模型`，这两类模型对应的C++预测功能汇总如下：
-|  ----  |   ----  |  ----  |  ---- |  ---- |  ----  |  ----| --- | 
-| DB   |ch_ppocr_mobile_v2.0_det| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
-| DB   |ch_ppocr_server_v2.0_det| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
-| CRNN |ch_ppocr_mobile_v2.0_rec| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
-| CRNN |ch_ppocr_server_v2.0_rec| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
-|PP-OCR|ch_ppocr_server_v2.0    | CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
-|PP-OCR|ch_ppocr_server_v2.0    | CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
+| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | 
+|  ----   |  ---- |   ----   |  :----:  |   :----:   |  :----:  |
+| 正常模型 | GPU | 1/6 | fp32/fp16 | - | - |
+| 正常模型 | CPU | 1/6 | - | fp32 | 支持 |
+| 量化模型 | GPU | 1/6 | int8 | - | - |
+| 量化模型 | CPU | 1/6 | - | int8 | 支持 |
+## 2. 测试流程
-## 1. 功能测试
+### 2.1 功能测试
-先运行`prepare.sh`准备数据和模型，然后运行`test_cpp.sh`进行测试，最终在```tests/output```目录下生成`cpp_infer_*.log`后缀的日志文件。
+先运行`prepare.sh`准备数据和模型，然后运行`test_inference_cpp.sh`进行测试，最终在```tests/output```目录下生成`cpp_infer_*.log`后缀的日志文件。
 ```shell
-bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt
+bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt "cpp_infer"
 # 用法1:
-bash tests/test_cpp.sh ./tests/configs/ppocr_det_mobile_params.txt
+bash tests/test_inference_cpp.sh ./tests/configs/ppocr_det_mobile_params.txt
 # 用法2: 指定GPU卡预测，第三个传入参数为GPU卡号
-bash tests/test_cpp.sh ./tests/configs/ppocr_det_mobile_params.txt '1'
+bash tests/test_inference_cpp.sh ./tests/configs/ppocr_det_mobile_params.txt '1'
 ```  
-## 2. 精度测试
+### 2.2 精度测试
 使用compare_results.py脚本比较模型预测的结果是否符合预期，主要步骤包括：
 - 提取日志中的预测坐标；
 - 从本地文件中提取保存好的坐标结果；
 - 比较上述两个结果是否符合精度预期，误差大于设置阈值时会报错。
-### 使用方式
+#### 使用方式
 运行命令：
 ```shell
-python3.7 tests/compare_results.py --gt_file=./tests/results/*.txt  --log_file=./tests/output/infer_*.log --atol=1e-3 --rtol=1e-3
+python3.7 tests/compare_results.py --gt_file=./tests/results/cpp_*.txt  --log_file=./tests/output/cpp_*.log --atol=1e-3 --rtol=1e-3
 ```
 参数介绍：  
@@ -47,10 +46,15 @@ python3.7 tests/compare_results.py --gt_file=./tests/results/*.txt  --log_file=.
 - atol: 设置的绝对误差
 - rtol: 设置的相对误差
-### 运行结果
+#### 运行结果
 正常运行效果如下图：
-<img src="compare_right.png" width="1000">
+<img src="compare_cpp_right.png" width="1000">
 出现不一致结果时的运行输出：
-<img src="compare_wrong.png" width="1000">
+<img src="compare_cpp_wrong.png" width="1000">
+## 3. 更多教程
+本文档为功能测试用，更详细的c++预测使用教程请参考：[服务器端C++预测](https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph/deploy/cpp_infer)  
--- a/tests/docs/test_python.md
+++ b/tests/docs/test_python.md
-# Python功能测试
+# 基础训练预测功能测试
-Python功能测试的主程序为`test_python.sh`，可以测试基于Python的模型训练、评估、推理等基本功能，包括裁剪、量化、蒸馏。
+基础训练预测功能测试的主程序为`test_train_inference_python.sh`，可以测试基于Python的模型训练、评估、推理等基本功能，包括裁剪、量化、蒸馏。
-## 测试结论汇总
+## 1. 测试结论汇总
 - 训练相关：
 | 算法名称 | 模型名称 | 单机单卡 | 单机多卡 | 多机多卡 | 模型压缩（单机多卡） |
 |  :----  |   :----  |    :----  |  :----   |  :----   |  :----   |
-|  DB  | ch_ppocr_mobile_v2.0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 |
+|  DB  | ch_ppocr_mobile_v2.0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 <br> 离线量化（无需训练） |
-|  DB  | ch_ppocr_server_v2.0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 |
+|  DB  | ch_ppocr_server_v2.0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 <br> 离线量化（无需训练） |
-| CRNN | ch_ppocr_mobile_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 |
+| CRNN | ch_ppocr_mobile_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：PACT量化 <br> 离线量化（无需训练） |
-| CRNN | ch_ppocr_server_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 |
+| CRNN | ch_ppocr_server_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：PACT量化 <br> 离线量化（无需训练） |
-|PP-OCR| ch_ppocr_mobile_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 |
+|PP-OCR| ch_ppocr_mobile_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
-|PP-OCR| ch_ppocr_server_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练：FPGM裁剪、PACT量化 |
+|PP-OCR| ch_ppocr_server_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
+|PP-OCRv2| ch_PP-OCRv2 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
- 预测相关：
+- 预测相关：基于训练是否使用量化，可以将训练产出的模型分为`正常模型`和`量化模型`，这两类模型对应的预测功能汇总如下：
-| 算法名称 | 模型名称 |device | batchsize | mkldnn | cpu多线程 | tensorrt | 离线量化 |
+| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | 
-|  ----  |   ----  |  ----  |  ---- |  ---- |  ----  |  ----| --- | 
+|  ----   |  ---- |   ----   |  :----:  |   :----:   |  :----:  |
-| DB   |ch_ppocr_mobile_v2.0_det| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
+| 正常模型 | GPU | 1/6 | fp32/fp16 | - | - |
-| DB   |ch_ppocr_server_v2.0_det| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
+| 正常模型 | CPU | 1/6 | - | fp32 | 支持 |
-| CRNN |ch_ppocr_mobile_v2.0_rec| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
+| 量化模型 | GPU | 1/6 | int8 | - | - |
-| CRNN |ch_ppocr_server_v2.0_rec| CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
+| 量化模型 | CPU | 1/6 | - | int8 | 支持 |
-|PP-OCR|ch_ppocr_server_v2.0    | CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
-|PP-OCR|ch_ppocr_server_v2.0    | CPU/GPU | 1/6 | 支持 | 支持 | fp32/fp16/int8 | 支持 |
+## 2. 测试流程
-## 1. 安装依赖
+### 2.1 安装依赖
 - 安装PaddlePaddle >= 2.0
 - 安装PaddleOCR依赖
    ```
@@ -46,50 +45,57 @@ Python功能测试的主程序为`test_python.sh`，可以测试基于Python的
    ```
-## 2. 功能测试
+### 2.2 功能测试
-先运行`prepare.sh`准备数据和模型，然后运行`test_python.sh`进行测试，最终在```tests/output```目录下生成`infer_*.log`格式的日志文件。
+先运行`prepare.sh`准备数据和模型，然后运行`test_train_inference_python.sh`进行测试，最终在```tests/output```目录下生成`python_infer_*.log`格式的日志文件。
-test_python.sh包含四种运行模式，每种模式的运行数据不同，分别用于测试速度和精度，分别是：
+`test_train_inference_python.sh`包含5种运行模式，每种模式的运行数据不同，分别用于测试速度和精度，分别是：
 - 模式1：lite_train_infer，使用少量数据训练，用于快速验证训练到预测的走通流程，不验证精度和速度；
 ```shell
 bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'lite_train_infer'
-bash tests/test_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'lite_train_infer'
+bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'lite_train_infer'
 ```  
 - 模式2：whole_infer，使用少量数据训练，一定量数据预测，用于验证训练后的模型执行预测，预测速度是否合理；
 ```shell
 bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_infer'
-bash tests/test_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_infer'
+bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_infer'
 ```  
- 模式3：infer 不训练，全量数据预测，走通开源模型评估、动转静，检查inference model预测时间和精度;
+- 模式3：infer，不训练，全量数据预测，走通开源模型评估、动转静，检查inference model预测时间和精度;
 ```shell
 bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer'
 # 用法1:
-bash tests/test_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer'
+bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer'
 # 用法2: 指定GPU卡预测，第三个传入参数为GPU卡号
-bash tests/test_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer' '1'
+bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer' '1'
 ```  
- 模式4：whole_train_infer , CE： 全量数据训练，全量数据预测，验证模型训练精度，预测精度，预测速度；
+- 模式4：whole_train_infer，CE： 全量数据训练，全量数据预测，验证模型训练精度，预测精度，预测速度；
 ```shell
 bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_train_infer'
-bash tests/test.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_train_infer'
+bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_train_infer'
 ```  
+- 模式5：klquant_infer，测试离线量化；
+```shell
+bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'klquant_infer'
+bash tests/test_train_inference_python.sh tests/configs/ppocr_det_mobile_params.txt  'klquant_infer'
+```
-## 3. 精度测试
+### 2.3 精度测试
 使用compare_results.py脚本比较模型预测的结果是否符合预期，主要步骤包括：
 - 提取日志中的预测坐标；
 - 从本地文件中提取保存好的坐标结果；
 - 比较上述两个结果是否符合精度预期，误差大于设置阈值时会报错。
-### 使用方式
+#### 使用方式
 运行命令：
 ```shell
-python3.7 tests/compare_results.py --gt_file=./tests/results/*.txt  --log_file=./tests/output/infer_*.log --atol=1e-3 --rtol=1e-3
+python3.7 tests/compare_results.py --gt_file=./tests/results/python_*.txt  --log_file=./tests/output/python_*.log --atol=1e-3 --rtol=1e-3
 ```
 参数介绍：  
@@ -98,10 +104,16 @@ python3.7 tests/compare_results.py --gt_file=./tests/results/*.txt  --log_file=.
 - atol: 设置的绝对误差
 - rtol: 设置的相对误差
-### 运行结果
+#### 运行结果
 正常运行效果如下图：
 <img src="compare_right.png" width="1000">
 出现不一致结果时的运行输出：
 <img src="compare_wrong.png" width="1000">
+## 3. 更多教程
+本文档为功能测试用，更丰富的训练预测使用教程请参考：  
+[模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/training.md)  
+[基于Python预测引擎推理](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/inference.md)
--- a/tests/prepare.sh
+++ b/tests/prepare.sh
@@ -134,5 +134,5 @@ if [ ${MODE} = "serving_infer" ];then
    wget -nc  -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar
    wget -nc  -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar
    wget -nc  -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar
-    cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_ppocr_server_v2.0_rec_infer.tar && tar xf ch_ppocr_server_v2.0_det_infer.tar cd ../
+    cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_ppocr_server_v2.0_rec_infer.tar && tar xf ch_ppocr_server_v2.0_det_infer.tar && cd ../
 fi
--- a/tests/readme.md
+++ b/tests/readme.md
 # 推理部署导航
-飞桨除了基本的模型训练和预测，还提供了支持多端多平台的高性能推理部署工具。本文档提供了PaddleOCR中所有模型的推理部署导航，方便用户查阅每种模型的推理部署打通情况，并可以进行一键测试。
+## 1. 简介
+飞桨除了基本的模型训练和预测，还提供了支持多端多平台的高性能推理部署工具。本文档提供了PaddleOCR中所有模型的推理部署导航PTDN（Paddle Train Deploy Navigation），方便用户查阅每种模型的推理部署打通情况，并可以进行一键测试。
 <div align="center">
    <img src="docs/guide.png" width="1000">
 </div>
+## 2. 汇总信息
 打通情况汇总如下，已填写的部分表示可以使用本工具进行一键测试，未填写的表示正在支持中。
-| 算法论文 | 模型名称 | 模型类型 | python训练预测 |   其他  |
+**字段说明：**
-| :--- | :--- |  :----  | :-------- |  :----  |
+- 基础训练预测：包括模型训练、Paddle Inference Python预测。
-| DB     |ch_ppocr_mobile_v2.0_det | 检测  | 支持 | Paddle Inference: C++预测 <br> Paddle Serving: Python, C++  <br> Paddle-Lite: Python, C++ / ARM CPU |
+- 其他：包括Paddle Inference C++预测、Paddle Serving部署、Paddle-Lite部署等。
-| DB     |ch_ppocr_server_v2.0_det | 检测  | 支持 | Paddle Inference: C++预测 <br> Paddle Serving: Python, C++  <br> Paddle-Lite: Python, C++ / ARM CPU |
+| 算法论文 | 模型名称 | 模型类型 | 基础训练预测 |   其他  |
+| :--- | :--- |  :----:  | :--------: |  :----  |
+| DB     |ch_ppocr_mobile_v2.0_det | 检测  | 支持 | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+| DB     |ch_ppocr_server_v2.0_det | 检测  | 支持 | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
 | DB     |ch_PP-OCRv2_det          | 检测  |
-| CRNN   |ch_ppocr_mobile_v2.0_rec | 识别  | 支持 | Paddle Inference: C++预测 <br> Paddle Serving: Python, C++  <br> Paddle-Lite: Python, C++ / ARM CPU |
+| CRNN   |ch_ppocr_mobile_v2.0_rec | 识别  | 支持 | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
-| CRNN   |ch_ppocr_server_v2.0_rec | 识别  | 支持 | Paddle Inference: C++预测 <br> Paddle Serving: Python, C++  <br> Paddle-Lite: Python, C++ / ARM CPU |
+| CRNN   |ch_ppocr_server_v2.0_rec | 识别  | 支持 | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
 | CRNN   |ch_PP-OCRv2_rec          | 识别  |
+| PP-OCR |ch_ppocr_mobile_v2.0 | 检测+识别  | 支持 | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+| PP-OCR |ch_ppocr_server_v2.0 | 检测+识别  | 支持 | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+|PP-OCRv2|ch_PP-OCRv2 | 检测+识别  | 支持 | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
 | DB     |det_mv3_db_v2.0                | 检测  |
 | DB     |det_r50_vd_db_v2.0             | 检测  |
 | EAST   |det_mv3_east_v2.0              | 检测  |
@@ -39,7 +51,7 @@
-## 一键测试工具使用
+## 3. 一键测试工具使用
 ### 目录介绍
 ```shell
@@ -56,14 +68,14 @@ tests/
 	├── ppocr_rec_server_params.txt     # 测试server版ppocr识别模型的参数配置文件
 	├── ...                                
 ├── results/   # 预先保存的预测结果，用于和实际预测结果进行精度比对
-	├── ppocr_det_mobile_results_fp32.txt           # 预存的mobile版ppocr检测模型fp32精度的结果
+	├── python_ppocr_det_mobile_results_fp32.txt           # 预存的mobile版ppocr检测模型python预测fp32精度的结果
-	├── ppocr_det_mobile_results_fp16.txt           # 预存的mobile版ppocr检测模型fp16精度的结果
+	├── python_ppocr_det_mobile_results_fp16.txt           # 预存的mobile版ppocr检测模型python预测fp16精度的结果
-	├── ppocr_det_mobile_results_fp32_cpp.txt       # 预存的mobile版ppocr检测模型c++预测的fp32精度的结果
+	├── cpp_ppocr_det_mobile_results_fp32.txt       # 预存的mobile版ppocr检测模型c++预测的fp32精度的结果
-	├── ppocr_det_mobile_results_fp16_cpp.txt       # 预存的mobile版ppocr检测模型c++预测的fp16精度的结果
+	├── cpp_ppocr_det_mobile_results_fp16.txt       # 预存的mobile版ppocr检测模型c++预测的fp16精度的结果
 	├── ...
 ├── prepare.sh                        # 完成test_*.sh运行所需要的数据和模型下载
-├── test_python.sh            # 测试python训练预测的主程序
+├── test_train_inference_python.sh    # 测试python训练预测的主程序
-├── test_cpp.sh               # 测试c++预测的主程序
+├── test_inference_cpp.sh             # 测试c++预测的主程序
 ├── test_serving.sh                   # 测试serving部署预测的主程序
 ├── test_lite.sh                      # 测试lite部署预测的主程序
 ├── compare_results.py                # 用于对比log中的预测结果与results中的预存结果精度误差是否在限定范围内
@@ -81,13 +93,13 @@ tests/
 3. 用`compare_results.py`对比log中的预测结果和预存在results目录下的结果，判断预测精度是否符合预期（在误差范围内）。
 其中，有4个测试主程序，功能如下：
- `test_python.sh`：测试基于Python的模型训练、评估、推理等基本功能，包括裁剪、量化、蒸馏。
+- `test_train_inference_python.sh`：测试基于Python的模型训练、评估、推理等基本功能，包括裁剪、量化、蒸馏。
- `test_cpp.sh`：测试基于C++的模型推理。
+- `test_inference_cpp.sh`：测试基于C++的模型推理。
 - `test_serving.sh`：测试基于Paddle Serving的服务化部署功能。
 - `test_lite.sh`：测试基于Paddle-Lite的端侧预测部署功能。
-各功能测试中涉及GPU/CPU、mkldnn、Tensorrt等多种参数配置，点击相应链接了解更多细节和使用教程：  
+各功能测试中涉及混合精度、裁剪、量化等训练相关，及mkldnn、Tensorrt等多种预测相关参数配置，请点击下方相应链接了解更多细节和使用教程：  
-[test_python使用](docs/test_python.md)  
+[test_train_inference_python 使用](docs/test_train_inference_python.md)  
-[test_cpp使用](docs/test_cpp.md)  
+[test_inference_cpp 使用](docs/test_inference_cpp.md)  
-[test_serving使用](docs/test_serving.md)  
+[test_serving 使用](docs/test_serving.md)  
-[test_lite使用](docs/test_lite.md)  
+[test_lite 使用](docs/test_lite.md)  
--- a/tests/results/ppocr_det_mobile_results_fp16_cpp.txt
+++ b/tests/results/ppocr_det_mobile_results_fp16_cpp.txt
--- a/tests/results/ppocr_det_mobile_results_fp32_cpp.txt
+++ b/tests/results/ppocr_det_mobile_results_fp32_cpp.txt
--- a/tests/results/ppocr_det_mobile_results_fp16.txt
+++ b/tests/results/ppocr_det_mobile_results_fp16.txt
--- a/tests/results/ppocr_det_mobile_results_fp32.txt
+++ b/tests/results/ppocr_det_mobile_results_fp32.txt
--- a/tests/test_cpp.sh
+++ b/tests/test_cpp.sh
@@ -56,7 +56,11 @@ function func_cpp_inference(){
                fi
                for threads in ${cpp_cpu_threads_list[*]}; do
                    for batch_size in ${cpp_batch_size_list[*]}; do
-                        _save_log_path="${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_${batch_size}.log"
+                        precision="fp32"  # default precision tag embedded in the log file name below
+                        if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
+                            precision="int8"  # quantized model: tag the log file name as int8
+                        fi
+                        _save_log_path="${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log"
                        set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}")
                        set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}")
                        set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}")

--- a/tests/test_serving.sh
+++ b/tests/test_serving.sh
@@ -2,44 +2,44 @@
 source tests/common_func.sh
 FILENAME=$1
-dataline=$(awk 'NR==67, NR==81{print}'  $FILENAME)
+dataline=$(awk 'NR==67, NR==83{print}'  $FILENAME)
 # parser params
 IFS=$'\n'
 lines=(${dataline})
 # parser serving
-trans_model_py=$(func_parser_value "${lines[1]}")
+model_name=$(func_parser_value "${lines[1]}")
-infer_model_dir_key=$(func_parser_key "${lines[2]}")
+python=$(func_parser_value "${lines[2]}")
-infer_model_dir_value=$(func_parser_value "${lines[2]}")
+trans_model_py=$(func_parser_value "${lines[3]}")
-model_filename_key=$(func_parser_key "${lines[3]}")
+infer_model_dir_key=$(func_parser_key "${lines[4]}")
-model_filename_value=$(func_parser_value "${lines[3]}")
+infer_model_dir_value=$(func_parser_value "${lines[4]}")
-params_filename_key=$(func_parser_key "${lines[4]}")
+model_filename_key=$(func_parser_key "${lines[5]}")
-params_filename_value=$(func_parser_value "${lines[4]}")
+model_filename_value=$(func_parser_value "${lines[5]}")
-serving_server_key=$(func_parser_key "${lines[5]}")
+params_filename_key=$(func_parser_key "${lines[6]}")
-serving_server_value=$(func_parser_value "${lines[5]}")
+params_filename_value=$(func_parser_value "${lines[6]}")
-serving_client_key=$(func_parser_key "${lines[6]}")
+serving_server_key=$(func_parser_key "${lines[7]}")
-serving_client_value=$(func_parser_value "${lines[6]}")
+serving_server_value=$(func_parser_value "${lines[7]}")
-serving_dir_value=$(func_parser_value "${lines[7]}")
+serving_client_key=$(func_parser_key "${lines[8]}")
-web_service_py=$(func_parser_value "${lines[8]}")
+serving_client_value=$(func_parser_value "${lines[8]}")
-web_use_gpu_key=$(func_parser_key "${lines[9]}")
+serving_dir_value=$(func_parser_value "${lines[9]}")
-web_use_gpu_list=$(func_parser_value "${lines[9]}")
+web_service_py=$(func_parser_value "${lines[10]}")
-web_use_mkldnn_key=$(func_parser_key "${lines[10]}")
+web_use_gpu_key=$(func_parser_key "${lines[11]}")
-web_use_mkldnn_list=$(func_parser_value "${lines[10]}")
+web_use_gpu_list=$(func_parser_value "${lines[11]}")
-web_cpu_threads_key=$(func_parser_key "${lines[11]}")
+web_use_mkldnn_key=$(func_parser_key "${lines[12]}")
-web_cpu_threads_list=$(func_parser_value "${lines[11]}")
+web_use_mkldnn_list=$(func_parser_value "${lines[12]}")
-web_use_trt_key=$(func_parser_key "${lines[12]}")
+web_cpu_threads_key=$(func_parser_key "${lines[13]}")
-web_use_trt_list=$(func_parser_value "${lines[12]}")
+web_cpu_threads_list=$(func_parser_value "${lines[13]}")
-web_precision_key=$(func_parser_key "${lines[13]}")
+web_use_trt_key=$(func_parser_key "${lines[14]}")
-web_precision_list=$(func_parser_value "${lines[13]}")
+web_use_trt_list=$(func_parser_value "${lines[14]}")
-pipeline_py=$(func_parser_value "${lines[14]}")
+web_precision_key=$(func_parser_key "${lines[15]}")
+web_precision_list=$(func_parser_value "${lines[15]}")
+pipeline_py=$(func_parser_value "${lines[16]}")
+LOG_PATH="../../tests/output"
-LOG_PATH="./tests/output"
+mkdir -p ./tests/output
-mkdir -p ${LOG_PATH}
 status_log="${LOG_PATH}/results_serving.log"
 function func_serving(){
    IFS='|'
    _python=$1
@@ -65,12 +65,12 @@ function func_serving(){
                    continue
                fi
                for threads in ${web_cpu_threads_list[*]}; do
-                      _save_log_path="${_log_path}/server_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_1.log"
+                      _save_log_path="${LOG_PATH}/server_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_1.log"
                      set_cpu_threads=$(func_set_params "${web_cpu_threads_key}" "${threads}")
-                      web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${web_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} &>${_save_log_path} &"
+                      web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${web_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} &"
                      eval $web_service_cmd
                      sleep 2s
-                      pipeline_cmd="${python} ${pipeline_py}"
+                      pipeline_cmd="${python} ${pipeline_py} > ${_save_log_path} 2>&1 "
                      eval $pipeline_cmd
                      last_status=${PIPESTATUS[0]}
                      eval "cat ${_save_log_path}"
@@ -93,13 +93,13 @@ function func_serving(){
                    if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [[ ${_flag_quant} = "True" ]]; then
                        continue
                    fi
-                    _save_log_path="${_log_path}/infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_1.log"
+                    _save_log_path="${LOG_PATH}/server_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_1.log"
                    set_tensorrt=$(func_set_params "${web_use_trt_key}" "${use_trt}")
                    set_precision=$(func_set_params "${web_precision_key}" "${precision}")
-                    web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} &>${_save_log_path} & "
+                    web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} & "
                    eval $web_service_cmd
                    sleep 2s
-                    pipeline_cmd="${python} ${pipeline_py}"
+                    pipeline_cmd="${python} ${pipeline_py} > ${_save_log_path} 2>&1"
                    eval $pipeline_cmd
                    last_status=${PIPESTATUS[0]}
                    eval "cat ${_save_log_path}"
@@ -129,3 +129,7 @@ eval $env
 echo "################### run test ###################"
+export Count=0
+IFS="|"
+func_serving "${web_service_cmd}"
--- a/tests/test_python.sh
+++ b/tests/test_python.sh
@@ -5,11 +5,7 @@ FILENAME=$1
 # MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer', 'klquant_infer']
 MODE=$2
-if [ ${MODE} = "klquant_infer" ]; then
+dataline=$(awk 'NR==1, NR==51{print}'  $FILENAME)
-    dataline=$(awk 'NR==82, NR==98{print}'  $FILENAME)
-else
-    dataline=$(awk 'NR==1, NR==51{print}'  $FILENAME)
-fi
 # parser params
 IFS=$'\n'
@@ -93,6 +89,8 @@ infer_value1=$(func_parser_value "${lines[50]}")
 # parser klquant_infer
 if [ ${MODE} = "klquant_infer" ]; then
+    dataline=$(awk 'NR==82, NR==98{print}'  $FILENAME)
+    lines=(${dataline})
    # parser inference model 
    infer_model_dir_list=$(func_parser_value "${lines[1]}")
    infer_export_list=$(func_parser_value "${lines[2]}")
@@ -143,7 +141,11 @@ function func_inference(){
                fi
                for threads in ${cpu_threads_list[*]}; do
                    for batch_size in ${batch_size_list[*]}; do
-                        _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_${batch_size}.log"
+                        precision="fp32"  # default precision tag embedded in the log file name below
+                        if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
+                            precision="int8"  # quantized model: tag the log file name as int8
+                        fi
+                        _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log"
                        set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
                        set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
                        set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
@@ -224,6 +226,9 @@ if [ ${MODE} = "infer" ] || [ ${MODE} = "klquant_infer" ]; then
        fi
        #run inference
        is_quant=${infer_quant_flag[Count]}
+        if [ ${MODE} = "klquant_infer" ]; then
+            is_quant="True"
+        fi
        func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant}
        Count=$(($Count + 1))
    done

--- a/doc/doc_ch/enhanced_ctc_loss.md
+++ b/doc/doc_ch/enhanced_ctc_loss.md
@@ -16,7 +16,7 @@ Focal Loss 出自论文《Focal Loss for Dense Object Detection》, 该loss最
 从上图可以看到, 当&gamma;> 0时，调整系数（1-y’）^&gamma; 赋予易分类样本损失一个更小的权重，使得网络更关注于困难的、错分的样本。 调整因子&gamma;用于调节简单样本权重降低的速率，当&gamma;为0时即为交叉熵损失函数，当&gamma;增加时，调整因子的影响也会随之增大。实验发现&gamma;为2是最优。平衡因子&alpha;用来平衡正负样本本身的比例不均，文中&alpha;取0.25。
-对于经典的CTC算法，假设某个特征序列（f<sub>1</sub>, f<sub>2</sub>, ......f<sub>t</sub>), 经过CTC解码之后结果等于label的概率为y’, 则CTC解码结果不为label的概率即为（1-y’)；不难发现 CTCLoss值和y’有如下关系：
+对于经典的CTC算法，假设某个特征序列（f<sub>1</sub>, f<sub>2</sub>, ......f<sub>t</sub>), 经过CTC解码之后结果等于label的概率为y’, 则CTC解码结果不为label的概率即为（1-y’)；不难发现, CTCLoss值和y’有如下关系：
 <div align="center"> 
 <img src="./equation_ctcloss.png" width = "250" /> 
 </div>
@@ -38,7 +38,7 @@ A-CTC Loss是CTC Loss + ACE Loss的简称。 其中ACE Loss出自论文< Aggrega
 <img src="./rec_algo_compare.png" width = "1000" /> 
 </div>
-虽然ACELoss确实如上图所说，可以处理2D预测，在内存占用及推理速度方面具备优势，但在实践过程中，我们发现单独使用ACE Loss,  识别效果并不如CTCLoss.  因此，我们尝试将CTCLoss和ACELoss进行组合，同时以CTCLoss为主，将ACELoss 定位为一个辅助监督loss。 这一尝试收到了效果，在我们内部的实验数据集上，相比单独使用CTCLoss，识别准确率可以提升1%左右。
+虽然ACELoss确实如上图所说，可以处理2D预测，在内存占用及推理速度方面具备优势，但在实践过程中，我们发现单独使用ACE Loss,  识别效果并不如CTCLoss.  因此，我们尝试将CTCLoss和ACELoss进行结合，同时以CTCLoss为主，将ACELoss 定位为一个辅助监督loss。 这一尝试收到了效果，在我们内部的实验数据集上，相比单独使用CTCLoss，识别准确率可以提升1%左右。
 A_CTC Loss定义如下:  
 <div align="center">
 <img src="./equation_a_ctc.png" width = "300" /> 
@@ -47,7 +47,7 @@ A_CTC Loss定义如下:
 实验中，λ = 0.1.  ACE loss实现代码见:  [ace_loss.py](../../ppocr/losses/ace_loss.py)
 ## 3. C-CTC Loss
-C-CTC Loss是CTC Loss + Center Loss的简称。 其中Center Loss出自论文 < A Discriminative Feature Learning Approach for Deep Face Recognition>.  最早用于人脸识别任务，用于增大累间距离，减小类内距离,  是Metric Learning领域一种较早的、也比较常用的一种算法。 
+C-CTC Loss是CTC Loss + Center Loss的简称。 其中Center Loss出自论文 < A Discriminative Feature Learning Approach for Deep Face Recognition>.  最早用于人脸识别任务，用于增大类间距离，减小类内距离,  是Metric Learning领域一种较早的、也比较常用的一种算法。 
 在中文OCR识别任务中，通过对badcase分析， 我们发现中文识别的一大难点是相似字符多，容易误识。 由此我们想到是否可以借鉴Metric Learning的想法， 增大相似字符的类间距，从而提高识别准确率。然而，Metric Learning主要用于图像识别领域，训练数据的标签为一个固定的值；而对于OCR识别来说，其本质上是一个序列识别任务，特征和label之间并不具有显式的对齐关系，因此两者如何结合依然是一个值得探索的方向。
 通过尝试Arcmargin, Cosmargin等方法， 我们最终发现Centerloss 有助于进一步提升识别的准确率。C_CTC Loss定义如下：
 <div align="center">

--- a/ppocr/losses/ace_loss.py
+++ b/ppocr/losses/ace_loss.py
@@ -32,6 +32,7 @@ class ACELoss(nn.Layer):
    def __call__(self, predicts, batch):
        if isinstance(predicts, (list, tuple)):
            predicts = predicts[-1]
        B, N = predicts.shape[:2]
        div = paddle.to_tensor([N]).astype('float32')
@@ -42,9 +43,7 @@ class ACELoss(nn.Layer):
        length = batch[2].astype("float32")
        batch = batch[3].astype("float32")
        batch[:, 0] = paddle.subtract(div, length)
        batch = paddle.divide(batch, div)
        loss = self.loss_func(aggregation_preds, batch)
        return {"loss_ace": loss}
--- a/ppocr/losses/center_loss.py
+++ b/ppocr/losses/center_loss.py
@@ -27,7 +27,6 @@ class CenterLoss(nn.Layer):
    """
    Reference: Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016.
    """
    def __init__(self,
                 num_classes=6625,
                 feat_dim=96,
@@ -37,8 +36,7 @@ class CenterLoss(nn.Layer):
        self.num_classes = num_classes
        self.feat_dim = feat_dim
        self.centers = paddle.randn(
-            shape=[self.num_classes, self.feat_dim]).astype(
+            shape=[self.num_classes, self.feat_dim]).astype("float64")
-                "float64")  #random center
        if init_center:
            assert os.path.exists(
@@ -60,22 +58,23 @@ class CenterLoss(nn.Layer):
        batch_size = feats_reshape.shape[0]
-        #calc feat * feat   
+        #calc l2 distance between feats and centers  
-        dist1 = paddle.sum(paddle.square(feats_reshape), axis=1, keepdim=True)
+        square_feat = paddle.sum(paddle.square(feats_reshape),
-        dist1 = paddle.expand(dist1, [batch_size, self.num_classes])
+                                 axis=1,
+                                 keepdim=True)
+        square_feat = paddle.expand(square_feat, [batch_size, self.num_classes])
-        #dist2 of centers
+        square_center = paddle.sum(paddle.square(self.centers),
-        dist2 = paddle.sum(paddle.square(self.centers), axis=1,
+                                   axis=1,
-                           keepdim=True)  #num_classes
+                                   keepdim=True)
-        dist2 = paddle.expand(dist2,
+        square_center = paddle.expand(
-                              [self.num_classes, batch_size]).astype("float64")
+            square_center, [self.num_classes, batch_size]).astype("float64")
-        dist2 = paddle.transpose(dist2, [1, 0])
+        square_center = paddle.transpose(square_center, [1, 0])
-        #first x * x + y * y
+        distmat = paddle.add(square_feat, square_center)
-        distmat = paddle.add(dist1, dist2)
+        feat_dot_center = paddle.matmul(feats_reshape,
-        tmp = paddle.matmul(feats_reshape,
                                        paddle.transpose(self.centers, [1, 0]))
-        distmat = distmat - 2.0 * tmp
+        distmat = distmat - 2.0 * feat_dot_center
        #generate the mask
        classes = paddle.arange(self.num_classes).astype("int64")
@@ -83,7 +82,8 @@ class CenterLoss(nn.Layer):
            paddle.unsqueeze(label, 1), (batch_size, self.num_classes))
        mask = paddle.equal(
            paddle.expand(classes, [batch_size, self.num_classes]),
-            label).astype("float64")  #get mask
+            label).astype("float64")
        dist = paddle.multiply(distmat, mask)
        loss = paddle.sum(paddle.clip(dist, min=1e-12, max=1e+12)) / batch_size
        return {'loss_center': loss}
--- a/tests/docs/test.png
+++ b/tests/docs/test.png