diff --git a/README.md b/README.md
index df47cc64b0c33dc3dc2798de4e7ff8092b7bb2bb..887d486c87fcfeab6e703f22f77d8c13b2dc1dc2 100755
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ The above pictures are the visualizations of the English recognition model. For
- Scan the QR code below with your Wechat, you can access to official technical exchange group. Look forward to your participation.
-
+
diff --git a/README_ch.md b/README_ch.md
index eafce6c2eecee4f65d4c8cfbfd9b0558deed3d9e..0c5d4e1c3e8237a0e3a96d6a581f8662e221e37a 100755
--- a/README_ch.md
+++ b/README_ch.md
@@ -8,7 +8,7 @@ PaddleOCR同时支持动态图与静态图两种编程范式
- 静态图版本:develop分支
**近期更新**
-- 2021.4.20 [FAQ](./doc/doc_ch/FAQ.md)新增5个高频问题,总数208个,每周一都会更新,欢迎大家持续关注。
+- 2021.4.26 [FAQ](./doc/doc_ch/FAQ.md)新增5个高频问题,总数213个,每周一都会更新,欢迎大家持续关注。
- PaddleOCR研发团队对最新发版内容技术深入解读,4月13日晚上19:00,[直播地址](https://live.bilibili.com/21689802)。
- 2021.4.8 release 2.1版本,新增AAAI 2021论文[端到端识别算法PGNet](./doc/doc_ch/pgnet.md)开源,[多语言模型](./doc/doc_ch/multi_languages.md)支持种类增加到80+。
- 2021.2.8 正式发布PaddleOCRv2.0(branch release/2.0)并设置为推荐用户使用的默认分支. 发布的详细内容,请参考: https://github.com/PaddlePaddle/PaddleOCR/releases/tag/v2.0.0
@@ -45,7 +45,7 @@ PaddleOCR同时支持动态图与静态图两种编程范式
- 微信扫描二维码加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
-
+
## 快速体验
@@ -78,7 +78,7 @@ PaddleOCR同时支持动态图与静态图两种编程范式
- 算法介绍
- [文本检测](./doc/doc_ch/algorithm_overview.md)
- [文本识别](./doc/doc_ch/algorithm_overview.md)
- - [PP-OCR Pipline](#PP-OCR)
+ - [PP-OCR Pipeline](#PP-OCR)
- [端到端PGNet算法](./doc/doc_ch/pgnet.md)
- 模型训练/评估
- [文本检测](./doc/doc_ch/detection.md)
@@ -113,7 +113,7 @@ PaddleOCR同时支持动态图与静态图两种编程范式
-## PP-OCR Pipline
+## PP-OCR Pipeline
diff --git a/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml b/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml
index 6a524e22cf4dea4c573d6b67e752c8527e973185..717c16814bac2f6fca78aa63566df12bd8cbf67b 100644
--- a/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml
+++ b/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: True
+ save_res_path: ./output/rec/predicts_chinese_common_v2.0.txt
Optimizer:
diff --git a/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml b/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml
index c96621c5684f3861a7bc2f5aa8d9684e6512e228..660465f301047110db7001db7a32e687f2917b61 100644
--- a/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml
+++ b/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: True
+ save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt
Optimizer:
diff --git a/configs/rec/rec_icdar15_train.yml b/configs/rec/rec_icdar15_train.yml
index 5ae47c67d8b062746d422daac44011fb5aca38e2..79e3ff88869d9e0fc7be5563830eed1244e38b76 100644
--- a/configs/rec/rec_icdar15_train.yml
+++ b/configs/rec/rec_icdar15_train.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_ic15.txt
Optimizer:
name: Adam
diff --git a/configs/rec/rec_mv3_none_bilstm_ctc.yml b/configs/rec/rec_mv3_none_bilstm_ctc.yml
index 900e98b6b34de824d9afb91b1867a86fe2debc24..9e0bd23edba053b44fc7241c0a587ced5cd1ac76 100644
--- a/configs/rec/rec_mv3_none_bilstm_ctc.yml
+++ b/configs/rec/rec_mv3_none_bilstm_ctc.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_mv3_none_bilstm_ctc.txt
Optimizer:
name: Adam
diff --git a/configs/rec/rec_mv3_none_none_ctc.yml b/configs/rec/rec_mv3_none_none_ctc.yml
index 6d86b90c007cab9708cab6db6d8e3045dd5187fb..904afe1134b565d6459cdcda4cbfa43ae4925b92 100644
--- a/configs/rec/rec_mv3_none_none_ctc.yml
+++ b/configs/rec/rec_mv3_none_none_ctc.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_mv3_none_none_ctc.txt
Optimizer:
name: Adam
diff --git a/configs/rec/rec_mv3_tps_bilstm_att.yml b/configs/rec/rec_mv3_tps_bilstm_att.yml
index 33aed74d83f9ab18b7bdfb12d0ed315c6eebc010..feaeb0545c687774938521e4c45c026207172f11 100644
--- a/configs/rec/rec_mv3_tps_bilstm_att.yml
+++ b/configs/rec/rec_mv3_tps_bilstm_att.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_mv3_tps_bilstm_att.txt
Optimizer:
diff --git a/configs/rec/rec_mv3_tps_bilstm_ctc.yml b/configs/rec/rec_mv3_tps_bilstm_ctc.yml
index 026c6a9dfbd6b6b543c0b4260c43cbf98e192e7b..65ab23c42aff54ee548867e3482d7400603551ad 100644
--- a/configs/rec/rec_mv3_tps_bilstm_ctc.yml
+++ b/configs/rec/rec_mv3_tps_bilstm_ctc.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_mv3_tps_bilstm_ctc.txt
Optimizer:
name: Adam
diff --git a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml
index 4052d426e51aa8c6e82ec216cfd65226922be602..331bb36ed84b83dc62a0f9b15524457238dedc13 100644
--- a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml
+++ b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_r34_vd_none_bilstm_ctc.txt
Optimizer:
name: Adam
diff --git a/configs/rec/rec_r34_vd_none_none_ctc.yml b/configs/rec/rec_r34_vd_none_none_ctc.yml
index c3e1d9a3a91ab6a51e28d458623aea788b952ca0..695a46958f669e4cb9508646080b45ac0767b8c9 100644
--- a/configs/rec/rec_r34_vd_none_none_ctc.yml
+++ b/configs/rec/rec_r34_vd_none_none_ctc.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_r34_vd_none_none_ctc.txt
Optimizer:
name: Adam
diff --git a/configs/rec/rec_r34_vd_tps_bilstm_att.yml b/configs/rec/rec_r34_vd_tps_bilstm_att.yml
index 87a14559849abd0a47c45f52b80d06bb0790ef0e..fdd3588c844ffd7ed61de73077ae2994f0ad498d 100644
--- a/configs/rec/rec_r34_vd_tps_bilstm_att.yml
+++ b/configs/rec/rec_r34_vd_tps_bilstm_att.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_r34_vd_tps_bilstm_att.txt
Optimizer:
diff --git a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
index 9c51962e58763838eb4a101d1d52a1a030e4b643..67108a6eaca2dd6f239261f5184341e5ade00dc0 100644
--- a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+++ b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
@@ -19,6 +19,7 @@ Global:
max_text_length: 25
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_r34_vd_tps_bilstm_ctc.txt
Optimizer:
name: Adam
diff --git a/configs/rec/rec_r50_fpn_srn.yml b/configs/rec/rec_r50_fpn_srn.yml
index 34a997f3c3f6bc3dccdc62757c6adb1e1d17cc53..fa7b1ae4e5fed41d3aa3670d6672cca01b63c359 100644
--- a/configs/rec/rec_r50_fpn_srn.yml
+++ b/configs/rec/rec_r50_fpn_srn.yml
@@ -20,6 +20,7 @@ Global:
num_heads: 8
infer_mode: False
use_space_char: False
+ save_res_path: ./output/rec/predicts_srn.txt
Optimizer:
diff --git a/deploy/cpp_infer/include/config.h b/deploy/cpp_infer/include/config.h
index dbfbc2df141042f1065b380010e1ea3ff3ccedab..cd02a997e304850ebc04ce2288f4e497dbb4be4a 100644
--- a/deploy/cpp_infer/include/config.h
+++ b/deploy/cpp_infer/include/config.h
@@ -49,6 +49,8 @@ public:
this->det_db_unclip_ratio = stod(config_map_["det_db_unclip_ratio"]);
+ this->use_polygon_score = bool(stoi(config_map_["use_polygon_score"]));
+
this->det_model_dir.assign(config_map_["det_model_dir"]);
this->rec_model_dir.assign(config_map_["rec_model_dir"]);
@@ -86,6 +88,8 @@ public:
double det_db_unclip_ratio = 2.0;
+ bool use_polygon_score = false;
+
std::string det_model_dir;
std::string rec_model_dir;
diff --git a/deploy/cpp_infer/include/ocr_det.h b/deploy/cpp_infer/include/ocr_det.h
index bab9c95fa4a3f1cb160ccbf9ca4587fa4c2ba16a..18318c9c4e37136db62c1338db1b58f82859f037 100644
--- a/deploy/cpp_infer/include/ocr_det.h
+++ b/deploy/cpp_infer/include/ocr_det.h
@@ -44,7 +44,8 @@ public:
const bool &use_mkldnn, const int &max_side_len,
const double &det_db_thresh,
const double &det_db_box_thresh,
- const double &det_db_unclip_ratio, const bool &visualize,
+ const double &det_db_unclip_ratio,
+ const bool &use_polygon_score, const bool &visualize,
const bool &use_tensorrt, const bool &use_fp16) {
this->use_gpu_ = use_gpu;
this->gpu_id_ = gpu_id;
@@ -57,6 +58,7 @@ public:
this->det_db_thresh_ = det_db_thresh;
this->det_db_box_thresh_ = det_db_box_thresh;
this->det_db_unclip_ratio_ = det_db_unclip_ratio;
+ this->use_polygon_score_ = use_polygon_score;
this->visualize_ = visualize;
this->use_tensorrt_ = use_tensorrt;
@@ -85,6 +87,7 @@ private:
double det_db_thresh_ = 0.3;
double det_db_box_thresh_ = 0.5;
double det_db_unclip_ratio_ = 2.0;
+ bool use_polygon_score_ = false;
bool visualize_ = true;
bool use_tensorrt_ = false;
diff --git a/deploy/cpp_infer/include/postprocess_op.h b/deploy/cpp_infer/include/postprocess_op.h
index a600ea6d106706af2fbadb249c862fc764714f9e..b384b79b3041bfcb96f042c6450d3c6e54f00498 100644
--- a/deploy/cpp_infer/include/postprocess_op.h
+++ b/deploy/cpp_infer/include/postprocess_op.h
@@ -55,7 +55,8 @@ public:
std::vector>>
BoxesFromBitmap(const cv::Mat pred, const cv::Mat bitmap,
- const float &box_thresh, const float &det_db_unclip_ratio);
+ const float &box_thresh, const float &det_db_unclip_ratio,
+ const bool &use_polygon_score);
std::vector>>
FilterTagDetRes(std::vector>> boxes,
diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md
index 4b65c51f63d70c2e62315a839e1765b4d61d5c6c..da74abb93730c3d7dcd7a265532a3c2e0a3ff5c7 100644
--- a/deploy/cpp_infer/readme.md
+++ b/deploy/cpp_infer/readme.md
@@ -183,7 +183,7 @@ cmake .. \
make -j
```
-`OPENCV_DIR`为opencv编译安装的地址;`LIB_DIR`为下载(`paddle_inference`文件夹)或者编译生成的Paddle预测库地址(`build/paddle_inference_install_dir`文件夹);`CUDA_LIB_DIR`为cuda库文件地址,在docker中;为`/usr/local/cuda/lib64`;`CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。
+`OPENCV_DIR`为opencv编译安装的地址;`LIB_DIR`为下载(`paddle_inference`文件夹)或者编译生成的Paddle预测库地址(`build/paddle_inference_install_dir`文件夹);`CUDA_LIB_DIR`为cuda库文件地址,在docker中为`/usr/local/cuda/lib64`;`CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。
* 编译完成之后,会在`build`文件夹下生成一个名为`ocr_system`的可执行文件。
@@ -211,6 +211,7 @@ max_side_len 960 # 输入图像长宽大于960时,等比例缩放图像,使
det_db_thresh 0.3 # 用于过滤DB预测的二值化图像,设置为0.-0.3对结果影响不明显
det_db_box_thresh 0.5 # DB后处理过滤box的阈值,如果检测存在漏框情况,可酌情减小
det_db_unclip_ratio 1.6 # 表示文本框的紧致程度,越小则文本框更靠近文本
+use_polygon_score 1 # 是否使用多边形框计算bbox score,0表示使用矩形框计算。矩形框计算速度更快,多边形框对弯曲文本区域计算更准确。
det_model_dir ./inference/det_db # 检测模型inference model地址
# cls config
diff --git a/deploy/cpp_infer/readme_en.md b/deploy/cpp_infer/readme_en.md
index 1749a12b5690e5a4b4bfff8cfe380cc7efa7367c..4a02ac38a3306a4f54abeeb759e062648bfb4cca 100644
--- a/deploy/cpp_infer/readme_en.md
+++ b/deploy/cpp_infer/readme_en.md
@@ -219,6 +219,7 @@ max_side_len 960 # Limit the maximum image height and width to 960
det_db_thresh 0.3 # Used to filter the binarized image of DB prediction, setting 0.-0.3 has no obvious effect on the result
det_db_box_thresh 0.5 # DDB post-processing filter box threshold, if there is a missing box detected, it can be reduced as appropriate
det_db_unclip_ratio 1.6 # Indicates the compactness of the text box, the smaller the value, the closer the text box to the text
+use_polygon_score 1 # Whether to use the polygon region to calculate the bbox score; 0 means the rectangular box is used instead. The rectangular box is faster to compute, while the polygon is more accurate for curved text areas.
det_model_dir ./inference/det_db # Address of detection inference model
# cls config
diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp
index 5c9042d4e80c569cfd4a57a84ad5c594e69dc350..588c8374ab341163835aea2ba6c7132640c74c64 100644
--- a/deploy/cpp_infer/src/main.cpp
+++ b/deploy/cpp_infer/src/main.cpp
@@ -59,7 +59,8 @@ int main(int argc, char **argv) {
config.gpu_mem, config.cpu_math_library_num_threads,
config.use_mkldnn, config.max_side_len, config.det_db_thresh,
config.det_db_box_thresh, config.det_db_unclip_ratio,
- config.visualize, config.use_tensorrt, config.use_fp16);
+ config.use_polygon_score, config.visualize,
+ config.use_tensorrt, config.use_fp16);
Classifier *cls = nullptr;
if (config.use_angle_cls == true) {
diff --git a/deploy/cpp_infer/src/ocr_det.cpp b/deploy/cpp_infer/src/ocr_det.cpp
index 489940f062fa9f8093282d20441704dd5cb8b382..9bfee6138577288156496d9b533b4da906ae7268 100644
--- a/deploy/cpp_infer/src/ocr_det.cpp
+++ b/deploy/cpp_infer/src/ocr_det.cpp
@@ -109,9 +109,9 @@ void DBDetector::Run(cv::Mat &img,
cv::Mat dilation_map;
cv::Mat dila_ele = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
cv::dilate(bit_map, dilation_map, dila_ele);
- boxes = post_processor_.BoxesFromBitmap(pred_map, dilation_map,
- this->det_db_box_thresh_,
- this->det_db_unclip_ratio_);
+ boxes = post_processor_.BoxesFromBitmap(
+ pred_map, dilation_map, this->det_db_box_thresh_,
+ this->det_db_unclip_ratio_, this->use_polygon_score_);
boxes = post_processor_.FilterTagDetRes(boxes, ratio_h, ratio_w, srcimg);
diff --git a/deploy/cpp_infer/src/postprocess_op.cpp b/deploy/cpp_infer/src/postprocess_op.cpp
index 1b71c210497778fcb70ffe8630e29245ad00136d..e7db70f3bff81390728c6b373b89cf06c74e4eca 100644
--- a/deploy/cpp_infer/src/postprocess_op.cpp
+++ b/deploy/cpp_infer/src/postprocess_op.cpp
@@ -160,35 +160,49 @@ std::vector> PostProcessor::GetMiniBoxes(cv::RotatedRect box,
}
float PostProcessor::PolygonScoreAcc(std::vector contour,
- cv::Mat pred){
+ cv::Mat pred) {
int width = pred.cols;
int height = pred.rows;
std::vector box_x;
std::vector box_y;
- for(int i=0; i> box_array,
return score;
}
-std::vector>>
-PostProcessor::BoxesFromBitmap(const cv::Mat pred, const cv::Mat bitmap,
- const float &box_thresh,
- const float &det_db_unclip_ratio) {
+std::vector>> PostProcessor::BoxesFromBitmap(
+ const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
+ const float &det_db_unclip_ratio, const bool &use_polygon_score) {
const int min_size = 3;
const int max_candidates = 1000;
@@ -267,9 +280,12 @@ PostProcessor::BoxesFromBitmap(const cv::Mat pred, const cv::Mat bitmap,
}
float score;
- score = BoxScoreFast(array, pred);
- /* compute using polygon*/
- // score = PolygonScoreAcc(contours[_i], pred);
+ if (use_polygon_score)
+ /* compute using polygon*/
+ score = PolygonScoreAcc(contours[_i], pred);
+ else
+ score = BoxScoreFast(array, pred);
+
if (score < box_thresh)
continue;
diff --git a/deploy/cpp_infer/src/preprocess_op.cpp b/deploy/cpp_infer/src/preprocess_op.cpp
old mode 100644
new mode 100755
index 87d8dbbd774f7dd29843d9a2c2533368f1914272..37e33aee44167738261db4138793754aa657f22c
--- a/deploy/cpp_infer/src/preprocess_op.cpp
+++ b/deploy/cpp_infer/src/preprocess_op.cpp
@@ -77,19 +77,10 @@ void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img,
int resize_h = int(float(h) * ratio);
int resize_w = int(float(w) * ratio);
- if (resize_h % 32 == 0)
- resize_h = resize_h;
- else if (resize_h / 32 < 1 + 1e-5)
- resize_h = 32;
- else
- resize_h = (resize_h / 32) * 32;
- if (resize_w % 32 == 0)
- resize_w = resize_w;
- else if (resize_w / 32 < 1 + 1e-5)
- resize_w = 32;
- else
- resize_w = (resize_w / 32) * 32;
+ resize_h = max(int(round(float(resize_h) / 32) * 32), 32);
+ resize_w = max(int(round(float(resize_w) / 32) * 32), 32);
+
if (!use_tensorrt) {
cv::resize(img, resize_img, cv::Size(resize_w, resize_h));
ratio_h = float(resize_h) / float(h);
diff --git a/deploy/cpp_infer/tools/config.txt b/deploy/cpp_infer/tools/config.txt
index 28085ca408d279fc61a1bce1abf1df9c05115c78..5f415a6af2f3b1a695f14e8191ed7b9e55932910 100644
--- a/deploy/cpp_infer/tools/config.txt
+++ b/deploy/cpp_infer/tools/config.txt
@@ -10,6 +10,7 @@ max_side_len 960
det_db_thresh 0.3
det_db_box_thresh 0.5
det_db_unclip_ratio 1.6
+use_polygon_score 1
det_model_dir ./inference/ch_ppocr_mobile_v2.0_det_infer/
# cls config
diff --git a/deploy/hubserving/ocr_cls/module.py b/deploy/hubserving/ocr_cls/module.py
index 803d5ac27e74d04177b21b5886675ac0b5f31698..e159e0d3f23e9654c2d0342fbe6fa86b257ed24b 100644
--- a/deploy/hubserving/ocr_cls/module.py
+++ b/deploy/hubserving/ocr_cls/module.py
@@ -16,6 +16,7 @@ import paddlehub as hub
from tools.infer.utility import base64_to_cv2
from tools.infer.predict_cls import TextClassifier
from tools.infer.utility import parse_args
+from deploy.hubserving.ocr_cls.params import read_params
@moduleinfo(
@@ -55,7 +56,6 @@ class OCRCls(hub.Module):
sys.argv = sys.argv[:1]
cfg = parse_args()
- from ocr_det.params import read_params
update_cfg_map = vars(read_params())
for key in update_cfg_map:
diff --git a/doc/doc_ch/FAQ.md b/doc/doc_ch/FAQ.md
index 6f008d97faacc80cdf28dc6b47fe3a403a98425e..631b2594c24e4a0d34b4967ea13fd8106052e605 100755
--- a/doc/doc_ch/FAQ.md
+++ b/doc/doc_ch/FAQ.md
@@ -9,41 +9,35 @@
## PaddleOCR常见问题汇总(持续更新)
-* [近期更新(2021.4.20)](#近期更新)
+* [近期更新(2021.4.26)](#近期更新)
* [【精选】OCR精选10个问题](#OCR精选10个问题)
* [【理论篇】OCR通用43个问题](#OCR通用问题)
* [基础知识13题](#基础知识)
* [数据集9题](#数据集2)
* [模型训练调优21题](#模型训练调优2)
-* [【实战篇】PaddleOCR实战150个问题](#PaddleOCR实战问题)
- * [使用咨询61题](#使用咨询)
+* [【实战篇】PaddleOCR实战160个问题](#PaddleOCR实战问题)
+ * [使用咨询63题](#使用咨询)
* [数据集18题](#数据集3)
- * [模型训练调优34题](#模型训练调优3)
- * [预测部署42题](#预测部署3)
+ * [模型训练调优35题](#模型训练调优3)
+ * [预测部署44题](#预测部署3)
-## 近期更新(2021.4.20)
+## 近期更新(2021.4.26)
-#### Q3.1.58: 使用PGNet进行eval报错?
-**A**: 需要注意,我们目前在release/2.1更新了评测代码,目前支持A,B两种评测模式:
-* A模式:该模式主要为了方便用户使用,与训练集一样的标注文件就可以正常进行eval操作, 代码中默认是A模式。
-* B模式:该模式主要为了保证我们的评测代码可以和Total Text官方的评测方式对齐,该模式下直接加载官方提供的mat文件进行eval。
+#### Q3.1.62: 弯曲文本(如略微形变的文档图像)漏检问题
+**A**: db后处理中计算文本框平均得分时,是求rectangle区域的平均分数,容易造成弯曲文本漏检,已新增求polygon区域的平均分数,会更准确,但速度有所降低,可按需选择,在相关pr中可查看[可视化对比效果](https://github.com/PaddlePaddle/PaddleOCR/pull/2604)。该功能通过参数 [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51)进行选择,参数值可选[`fast`(默认)、`slow`],`fast`对应原始的rectangle方式,`slow`对应polygon方式。感谢用户[buptlihang](https://github.com/buptlihang)提[pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574)帮助解决该问题🌹。
-#### Q3.1.59: 使用预训练模型进行预测,对于特定字符识别识别效果较差,怎么解决?
-**A**: 由于我们所提供的识别模型是基于通用大规模数据集进行训练的,部分字符可能在训练集中包含较少,因此您可以构建特定场景的数据集,基于我们提供的预训练模型进行微调。建议用于微调的数据集中,每个字符出现的样本数量不低于300,但同时需要注意不同字符的数量均衡。具体可以参考:[微调](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/recognition.md#2-%E5%90%AF%E5%8A%A8%E8%AE%AD%E7%BB%83)
+#### Q3.1.63: 请问端到端的pgnet相比于DB+CRNN在准确率上有优势吗?或者是pgnet最擅长的场景是什么场景呢?
+**A**: pgnet是端到端算法,检测识别一步到位,不用分开训练2个模型,也支持弯曲文本的识别,但是在中文上的效果还没有充分验证;db+crnn的验证更充分,应用相对成熟,常规非弯曲的文本都能解的不错。
-#### Q3.1.60: PGNet有中文预训练模型吗?
-**A**: 目前我们尚未提供针对中文的预训练模型,如有需要,可以尝试自己训练。具体需要修改的地方有:
- 1. [config文件中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/configs/e2e/e2e_r50_vd_pg.yml#L23-L24),字典文件路径及语种设置;
- 1. [网络结构中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/modeling/heads/e2e_pg_head.py#L181),`out_channels`修改为字典中的字符数目+1(考虑到空格);
- 1. [loss中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/losses/e2e_pg_loss.py#L93),修改`37`为字典中的字符数目+1(考虑到空格);
+#### Q3.3.35: SRN训练不收敛(loss不降)或SRN训练acc一直为0。
+**A**: 如果loss下降不正常,需要确认没有修改yml文件中的image_shape,默认[1, 64, 256],代码中针对这个配置写死了,修改可能会造成无法收敛。如果确认参数无误,loss正常下降,可以多迭代一段时间观察下,开始acc为0是正常的。
-#### Q3.1.61: 用于PGNet的训练集,文本框的标注有要求吗?
-**A**: PGNet支持多点标注,比如4点、8点、14点等。但需要注意的是,标注点尽可能分布均匀(相邻标注点间隔距离均匀一致),且label文件中的标注点需要从标注框的左上角开始,按标注点顺时针顺序依次编写,以上问题都可能对训练精度造成影响。
-我们提供的,基于Total Text数据集的PGNet预训练模型使用了14点标注方式。
+#### Q3.4.43: 预测时显存爆炸、内存泄漏问题?
+**A**: 打开显存/内存优化开关`enable_memory_optim`可以解决该问题,相关代码已合入,[查看详情](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L153)。
-#### Q3.4.42: 在使用PaddleLite进行预测部署时,启动预测后卡死/手机死机?
-**A**: 请检查模型转换时所用PaddleLite的版本,和预测库的版本是否对齐。即PaddleLite版本为2.8,则预测库版本也要为2.8。
+#### Q3.4.44: 如何多进程预测?
+**A**: 近期PaddleOCR新增了[多进程预测控制参数](https://github.com/PaddlePaddle/PaddleOCR/blob/a312647be716776c1aac33ff939ae358a39e8188/tools/infer/utility.py#L103),`use_mp`表示是否使用多进程,`total_process_num`表示在使用多进程时的进程数。具体使用方式请参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)。
## 【精选】OCR精选10个问题
@@ -638,6 +632,11 @@ repo中config.yml文件的前后处理参数和inference预测默认的超参数
**A**: PGNet支持多点标注,比如4点、8点、14点等。但需要注意的是,标注点尽可能分布均匀(相邻标注点间隔距离均匀一致),且label文件中的标注点需要从标注框的左上角开始,按标注点顺时针顺序依次编写,以上问题都可能对训练精度造成影响。
我们提供的,基于Total Text数据集的PGNet预训练模型使用了14点标注方式。
+#### Q3.1.62: 弯曲文本(如略微形变的文档图像)漏检问题
+**A**: db后处理中计算文本框平均得分时,是求rectangle区域的平均分数,容易造成弯曲文本漏检,已新增求polygon区域的平均分数,会更准确,但速度有所降低,可按需选择,在相关pr中可查看[可视化对比效果](https://github.com/PaddlePaddle/PaddleOCR/pull/2604)。该功能通过参数 [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51)进行选择,参数值可选[`fast`(默认)、`slow`],`fast`对应原始的rectangle方式,`slow`对应polygon方式。感谢用户[buptlihang](https://github.com/buptlihang)提[pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574)帮助解决该问题🌹。
+
+#### Q3.1.63: 请问端到端的pgnet相比于DB+CRNN在准确率上有优势吗?或者是pgnet最擅长的场景是什么场景呢?
+**A**: pgnet是端到端算法,检测识别一步到位,不用分开训练2个模型,也支持弯曲文本的识别,但是在中文上的效果还没有充分验证;db+crnn的验证更充分,应用相对成熟,常规非弯曲的文本都能解的不错。
@@ -911,8 +910,10 @@ lr:
#### Q3.3.34: 表格识别中,如何提高单字的识别结果?
**A**: 首先需要确认一下检测模型有没有有效的检测出单个字符,如果没有的话,需要在训练集当中添加相应的单字数据集。
-
+#### Q3.3.35: SRN训练不收敛(loss不降)或SRN训练acc一直为0。
+**A**: 如果loss下降不正常,需要确认没有修改yml文件中的image_shape,默认[1, 64, 256],代码中针对这个配置写死了,修改可能会造成无法收敛。如果确认参数无误,loss正常下降,可以多迭代一段时间观察下,开始acc为0是正常的。
+
### 预测部署
@@ -956,10 +957,6 @@ lr:
**A**:在安卓APK上无法设置,没有暴露这个接口,如果使用的是PaddledOCR/deploy/lite/的demo,可以修改config.txt中的对应参数来设置
-#### Q3.4.9:PaddleOCR模型是否可以转换成ONNX模型?
-
-**A**:目前暂不支持转ONNX,相关工作在研发中。
-
#### Q3.4.10:使用opt工具对检测模型转换时报错 can not found op arguments for node conv2_b_attr
**A**:这个问题大概率是编译opt工具的Paddle-Lite不是develop分支,建议使用Paddle-Lite 的develop分支编译opt工具。
@@ -1114,3 +1111,9 @@ nvidia-smi --lock-gpu-clocks=1590 -i 0
#### Q3.4.42: 在使用PaddleLite进行预测部署时,启动预测后卡死/手机死机?
**A**: 请检查模型转换时所用PaddleLite的版本,和预测库的版本是否对齐。即PaddleLite版本为2.8,则预测库版本也要为2.8。
+
+#### Q3.4.43: 预测时显存爆炸、内存泄漏问题?
+**A**: 打开显存/内存优化开关`enable_memory_optim`可以解决该问题,相关代码已合入,[查看详情](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L153)。
+
+#### Q3.4.44: 如何多进程预测?
+**A**: 近期PaddleOCR新增了[多进程预测控制参数](https://github.com/PaddlePaddle/PaddleOCR/blob/a312647be716776c1aac33ff939ae358a39e8188/tools/infer/utility.py#L103),`use_mp`表示是否使用多进程,`total_process_num`表示在使用多进程时的进程数。具体使用方式请参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)。
diff --git a/doc/doc_ch/multi_languages.md b/doc/doc_ch/multi_languages.md
index a3bc6f1083a285fa818de7089cfaf337b36e45e5..306eba36e463cb4aef20a1d8ff895ecfcc77d0ef 100755
--- a/doc/doc_ch/multi_languages.md
+++ b/doc/doc_ch/multi_languages.md
@@ -47,7 +47,7 @@ PaddleOCR 旨在打造一套丰富、领先、且实用的OCR工具库,不仅
pip install paddlepaddle
# gpu
-pip instll paddlepaddle-gpu
+pip install paddlepaddle-gpu
```
@@ -179,11 +179,11 @@ ppocr 支持使用自己的数据进行自定义训练或finetune, 其中识别
## 4 预测部署
除了安装whl包进行快速预测,ppocr 也提供了多种预测部署方式,如有需求可阅读相关文档:
-- [基于Python脚本预测引擎推理](./doc/doc_ch/inference.md)
-- [基于C++预测引擎推理](./deploy/cpp_infer/readme.md)
-- [服务化部署](./deploy/hubserving/readme.md)
+- [基于Python脚本预测引擎推理](./inference.md)
+- [基于C++预测引擎推理](../../deploy/cpp_infer/readme.md)
+- [服务化部署](../../deploy/hubserving/readme.md)
- [端侧部署](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme.md)
-- [Benchmark](./doc/doc_ch/benchmark.md)
+- [Benchmark](./benchmark.md)
diff --git a/doc/doc_en/multi_languages_en.md b/doc/doc_en/multi_languages_en.md
index fa00bdd3f8c9917cb501bfbb8bf95e8e0c51edde..e58b782ca18d55dbd954382fd0df6f53910e2e52 100755
--- a/doc/doc_en/multi_languages_en.md
+++ b/doc/doc_en/multi_languages_en.md
@@ -48,7 +48,7 @@ This document will briefly introduce how to use the multilingual model.
pip install paddlepaddle
# gpu
-pip instll paddlepaddle-gpu
+pip install paddlepaddle-gpu
```
@@ -181,11 +181,11 @@ In addition to installing the whl package for quick forecasting,
ppocr also provides a variety of forecasting deployment methods.
If necessary, you can read related documents:
-- [Python Inference](./doc/doc_en/inference_en.md)
-- [C++ Inference](./deploy/cpp_infer/readme_en.md)
-- [Serving](./deploy/hubserving/readme_en.md)
+- [Python Inference](./inference_en.md)
+- [C++ Inference](../../deploy/cpp_infer/readme_en.md)
+- [Serving](../../deploy/hubserving/readme_en.md)
- [Mobile](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme_en.md)
-- [Benchmark](./doc/doc_en/benchmark_en.md)
+- [Benchmark](./benchmark_en.md)
diff --git a/ppocr/modeling/backbones/rec_resnet_vd.py b/ppocr/modeling/backbones/rec_resnet_vd.py
index 6837ea0fb2da3347fd8e115f859224e2a61fd578..0187deb96f111a2c2b545c7be42dba48c7352e17 100644
--- a/ppocr/modeling/backbones/rec_resnet_vd.py
+++ b/ppocr/modeling/backbones/rec_resnet_vd.py
@@ -249,7 +249,7 @@ class ResNet(nn.Layer):
name=conv_name))
shortcut = True
self.block_list.append(bottleneck_block)
- self.out_channels = num_filters[block]
+ self.out_channels = num_filters[block] * 4
else:
for block in range(len(depth)):
shortcut = False
diff --git a/ppocr/modeling/heads/self_attention.py b/ppocr/modeling/heads/self_attention.py
index 51d5198f558dcb7e0351f04b3a884b71707104d4..6c27fdbe434166e9277cc8d695bce2743cbd8ec6 100644
--- a/ppocr/modeling/heads/self_attention.py
+++ b/ppocr/modeling/heads/self_attention.py
@@ -285,8 +285,7 @@ class PrePostProcessLayer(nn.Layer):
elif cmd == "n": # add layer normalization
self.functors.append(
self.add_sublayer(
- "layer_norm_%d" % len(
- self.sublayers(include_sublayers=False)),
+ "layer_norm_%d" % len(self.sublayers()),
paddle.nn.LayerNorm(
normalized_shape=d_model,
weight_attr=fluid.ParamAttr(
@@ -320,9 +319,7 @@ class PrepareEncoder(nn.Layer):
self.src_emb_dim = src_emb_dim
self.src_max_len = src_max_len
self.emb = paddle.nn.Embedding(
- num_embeddings=self.src_max_len,
- embedding_dim=self.src_emb_dim,
- sparse=True)
+ num_embeddings=self.src_max_len, embedding_dim=self.src_emb_dim)
self.dropout_rate = dropout_rate
def forward(self, src_word, src_pos):
diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py
index 717a09ecdad9362b7ea6e556c1136f5791d2f06c..3707cb1791482bf2ac458025677db8b7ee98f2ad 100755
--- a/tools/infer/predict_det.py
+++ b/tools/infer/predict_det.py
@@ -39,7 +39,10 @@ class TextDetector(object):
self.args = args
self.det_algorithm = args.det_algorithm
pre_process_list = [{
- 'DetResizeForTest': None
+ 'DetResizeForTest': {
+ 'limit_side_len': args.det_limit_side_len,
+ 'limit_type': args.det_limit_type
+ }
}, {
'NormalizeImage': {
'std': [0.229, 0.224, 0.225],
@@ -160,7 +163,6 @@ class TextDetector(object):
shape_list = np.expand_dims(shape_list, axis=0)
img = img.copy()
starttime = time.time()
-
self.input_tensor.copy_from_cpu(img)
self.predictor.run()
outputs = []
diff --git a/tools/infer_rec.py b/tools/infer_rec.py
index 075ec261e492cf21c668364ae6119fb4903f823b..2563f5a8197ed39b1b5d44c7cfee32797e760758 100755
--- a/tools/infer_rec.py
+++ b/tools/infer_rec.py
@@ -73,35 +73,45 @@ def main():
global_config['infer_mode'] = True
ops = create_operators(transforms, global_config)
+ save_res_path = config['Global'].get('save_res_path',
+ "./output/rec/predicts_rec.txt")
+ if not os.path.exists(os.path.dirname(save_res_path) or "."):  # dirname is "" for a bare file name; treat it as the cwd so makedirs("") is never called
+     os.makedirs(os.path.dirname(save_res_path))
+
model.eval()
- for file in get_image_file_list(config['Global']['infer_img']):
- logger.info("infer_img: {}".format(file))
- with open(file, 'rb') as f:
- img = f.read()
- data = {'image': img}
- batch = transform(data, ops)
- if config['Architecture']['algorithm'] == "SRN":
- encoder_word_pos_list = np.expand_dims(batch[1], axis=0)
- gsrm_word_pos_list = np.expand_dims(batch[2], axis=0)
- gsrm_slf_attn_bias1_list = np.expand_dims(batch[3], axis=0)
- gsrm_slf_attn_bias2_list = np.expand_dims(batch[4], axis=0)
-
- others = [
- paddle.to_tensor(encoder_word_pos_list),
- paddle.to_tensor(gsrm_word_pos_list),
- paddle.to_tensor(gsrm_slf_attn_bias1_list),
- paddle.to_tensor(gsrm_slf_attn_bias2_list)
- ]
-
- images = np.expand_dims(batch[0], axis=0)
- images = paddle.to_tensor(images)
- if config['Architecture']['algorithm'] == "SRN":
- preds = model(images, others)
- else:
- preds = model(images)
- post_result = post_process_class(preds)
- for rec_reuslt in post_result:
- logger.info('\t result: {}'.format(rec_reuslt))
+
+ with open(save_res_path, "w") as fout:
+ for file in get_image_file_list(config['Global']['infer_img']):
+ logger.info("infer_img: {}".format(file))
+ with open(file, 'rb') as f:
+ img = f.read()
+ data = {'image': img}
+ batch = transform(data, ops)
+ if config['Architecture']['algorithm'] == "SRN":
+ encoder_word_pos_list = np.expand_dims(batch[1], axis=0)
+ gsrm_word_pos_list = np.expand_dims(batch[2], axis=0)
+ gsrm_slf_attn_bias1_list = np.expand_dims(batch[3], axis=0)
+ gsrm_slf_attn_bias2_list = np.expand_dims(batch[4], axis=0)
+
+ others = [
+ paddle.to_tensor(encoder_word_pos_list),
+ paddle.to_tensor(gsrm_word_pos_list),
+ paddle.to_tensor(gsrm_slf_attn_bias1_list),
+ paddle.to_tensor(gsrm_slf_attn_bias2_list)
+ ]
+
+ images = np.expand_dims(batch[0], axis=0)
+ images = paddle.to_tensor(images)
+ if config['Architecture']['algorithm'] == "SRN":
+ preds = model(images, others)
+ else:
+ preds = model(images)
+ post_result = post_process_class(preds)
+ for rec_reuslt in post_result:
+ logger.info('\t result: {}'.format(rec_reuslt))
+ if len(rec_reuslt) >= 2:
+ fout.write(file + "\t" + rec_reuslt[0] + "\t" + str(
+ rec_reuslt[1]) + "\n")
logger.info("success!")
diff --git a/tools/program.py b/tools/program.py
index c22bf18b991a8aed6d47a1ea242aa3b7bb02aacc..ad6fcbd9b5a1213e7e88ef7c82fde07ff29bcb80 100755
--- a/tools/program.py
+++ b/tools/program.py
@@ -18,6 +18,7 @@ from __future__ import print_function
import os
import sys
+import platform
import yaml
import time
import shutil
@@ -196,9 +197,11 @@ def train(config,
train_reader_cost = 0.0
batch_sum = 0
batch_start = time.time()
+ max_iter = len(train_dataloader) - 1 if platform.system(
+ ) == "Windows" else len(train_dataloader)
for idx, batch in enumerate(train_dataloader):
train_reader_cost += time.time() - batch_start
- if idx >= len(train_dataloader):
+ if idx >= max_iter:
break
lr = optimizer.get_lr()
images = batch[0]
@@ -335,8 +338,10 @@ def eval(model, valid_dataloader, post_process_class, eval_class,
total_frame = 0.0
total_time = 0.0
pbar = tqdm(total=len(valid_dataloader), desc='eval model:')
+ max_iter = len(valid_dataloader) - 1 if platform.system(
+ ) == "Windows" else len(valid_dataloader)
for idx, batch in enumerate(valid_dataloader):
- if idx >= len(valid_dataloader):
+ if idx >= max_iter:
break
images = batch[0]
start = time.time()