diff --git a/README_ch.md b/README_ch.md
index 76fad9630c50397b4f2c1e9f259e4c81eb5eec0d..80e17e2388c90488903d6e5c258cd855bb8d4f24 100755
--- a/README_ch.md
+++ b/README_ch.md
@@ -27,13 +27,6 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力

 ## 📣 近期更新

-- 💼 **2022.9.5 飞桨智慧金融行业系列直播课**
-  - PaddleOCR发布四大范例:印章弯曲文本检测与识别、扫描版合同关键信息抽取、通用卡证结构化信息提取、中文表格识别与属性分析
-  - 9月6日起每周二、周四19点直播,扫码免费加入微信群获取直播链接,与行业专家深度交流
-
 - **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)**
   - 发布[PP-Structurev2](./ppstructure/README_ch.md),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery/README_ch.md),支持**一行命令完成PDF转Word**;
@@ -83,9 +76,10 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
 ## 👫 开源社区

 - **📑项目合作:** 如果您是企业开发者且有明确的OCR垂类应用需求,填写[问卷](https://paddle.wjx.cn/vj/QwF7GKw.aspx)后可免费与官方团队展开不同层次的合作。
-- **👫加入社区:** 微信扫描二维码并填写问卷之后,加入交流群领取福利
-  - **获取PaddleOCR最新发版解说《OCR超强技术详解与产业应用实战》系列直播课回放链接**
-  - **10G重磅OCR学习大礼包:**《动手学OCR》电子书,配套讲解视频和notebook项目;66篇OCR相关顶会前沿论文打包放送,包括CVPR、AAAI、IJCAI、ICCV等;PaddleOCR历次发版直播课视频;OCR社区优秀开发者项目分享视频。
+- **👫加入社区:** 微信扫描二维码并填写问卷之后,加入交流群领取20G重磅OCR学习大礼包
+  - **包括《动手学OCR》电子书**,配套讲解视频和notebook项目;PaddleOCR历次发版直播课视频;
+  - **OCR场景应用模型集合:** 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等垂类模型,覆盖通用、制造、金融、交通行业的主要OCR垂类应用。
+  - PDF2Word应用程序;OCR社区优秀开发者项目分享视频。
 - **🏅️社区项目**:[社区项目](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙,也是帮助优质项目宣传的广播站。
 - **🎁社区常规赛**:社区常规赛是面向OCR开发者的积分赛事,覆盖文档、代码、模型和应用四大类型,以季度为单位评选并发放奖励,赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。
diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml
index 6453934b7324b2b351aeb6fdf8e4e4de24b022bf..7e98280b32558b8d3d203084e6e327bc7cd782bf 100644
--- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml
+++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml
@@ -88,6 +88,7 @@ Train:
         prob: 0.5
         ext_data_num: 2
         image_shape: [48, 320, 3]
+        max_text_length: *max_text_length
     - RecAug:
     - MultiLabelEncode:
     - RecResizeImg:
diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
index e7cbae59a14af73639e1a74a14021b9b2ef60057..427255738696d8e6a073829350c40b00ef30115f 100644
--- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
+++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
@@ -162,6 +162,7 @@ Train:
         prob: 0.5
         ext_data_num: 2
         image_shape: [48, 320, 3]
+        max_text_length: *max_text_length
     - RecAug:
     - MultiLabelEncode:
     - RecResizeImg:
diff --git a/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml
index ff536edec4d6e7a85a6e6c189d56a23ffabc5583..c728e0ac823b0bf835322dcbd0c385c3ac7b2489 100644
--- a/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml
+++ b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml
@@ -88,6 +88,7 @@ Train:
         prob: 0.5
         ext_data_num: 2
         image_shape: [48, 320, 3]
+        max_text_length: *max_text_length
     - RecAug:
     - MultiLabelEncode:
     - RecResizeImg:
diff --git a/deploy/cpp_infer/include/args.h b/deploy/cpp_infer/include/args.h
index e0dd8bbcd1044fd695c90805bc770de5b47e51cf..f7fac9c92c421ca85818b2d04097ce8e55ea117e 100644
--- a/deploy/cpp_infer/include/args.h
+++ b/deploy/cpp_infer/include/args.h
@@ -54,6 +54,7 @@ DECLARE_string(table_model_dir);
 DECLARE_int32(table_max_len);
 DECLARE_int32(table_batch_num);
 DECLARE_string(table_char_dict_path);
+DECLARE_bool(merge_no_span_structure);
 // forward related
 DECLARE_bool(det);
 DECLARE_bool(rec);
diff --git a/deploy/cpp_infer/include/paddlestructure.h b/deploy/cpp_infer/include/paddlestructure.h
index b30ac045b2a6552b69442b2e8b29673efc820e31..6d2c8b7d203a05f531b8d038d885061c42897373 100644
--- a/deploy/cpp_infer/include/paddlestructure.h
+++ b/deploy/cpp_infer/include/paddlestructure.h
@@ -54,15 +54,12 @@ private:
            std::vector<double> &time_info_det,
            std::vector<double> &time_info_rec,
            std::vector<double> &time_info_cls);
-  std::string
-  rebuild_table(std::vector<std::string> rec_html_tags,
-                std::vector<std::vector<std::vector<int>>> rec_boxes,
-                std::vector<OCRPredictResult> &ocr_result);
+  std::string rebuild_table(std::vector<std::string> rec_html_tags,
+                            std::vector<std::vector<int>> rec_boxes,
+                            std::vector<OCRPredictResult> &ocr_result);

-  float iou(std::vector<std::vector<int>> &box1,
-            std::vector<std::vector<int>> &box2);
-  float dis(std::vector<std::vector<int>> &box1,
-            std::vector<std::vector<int>> &box2);
+  float iou(std::vector<int> &box1, std::vector<int> &box2);
+  float dis(std::vector<int> &box1, std::vector<int> &box2);

   static bool comparison_dis(const std::vector<float> &dis1,
                              const std::vector<float> &dis2) {
diff --git a/deploy/cpp_infer/include/postprocess_op.h b/deploy/cpp_infer/include/postprocess_op.h
index 77b3f8b660bda29815245b31ab8cac479b24498f..f5db52a6097f0fb916fc96fd8c76095f2ed1a9fa 100644
--- a/deploy/cpp_infer/include/postprocess_op.h
+++ b/deploy/cpp_infer/include/postprocess_op.h
@@ -92,14 +92,13 @@ private:

 class TablePostProcessor {
 public:
-  void init(std::string label_path);
-  void
-  Run(std::vector<float> &loc_preds, std::vector<float> &structure_probs,
-      std::vector<float> &rec_scores, std::vector<int> &loc_preds_shape,
-      std::vector<int> &structure_probs_shape,
-      std::vector<std::vector<std::string>> &rec_html_tag_batch,
-      std::vector<std::vector<std::vector<std::vector<int>>>> &rec_boxes_batch,
-      std::vector<int> &width_list, std::vector<int> &height_list);
+  void init(std::string label_path, bool merge_no_span_structure = true);
+  void Run(std::vector<float> &loc_preds, std::vector<float> &structure_probs,
+           std::vector<float> &rec_scores, std::vector<int> &loc_preds_shape,
+           std::vector<int> &structure_probs_shape,
+           std::vector<std::vector<std::string>> &rec_html_tag_batch,
+           std::vector<std::vector<std::vector<int>>> &rec_boxes_batch,
+           std::vector<int> &width_list, std::vector<int> &height_list);

 private:
   std::vector<std::string> label_list_;
diff --git a/deploy/cpp_infer/include/structure_table.h b/deploy/cpp_infer/include/structure_table.h
index 7449c6cd0e158425bccb75740191dd0b6d6ecc9b..c09e65654a7c8a4deb6729ddfd876531020f306b 100644
--- a/deploy/cpp_infer/include/structure_table.h
+++ b/deploy/cpp_infer/include/structure_table.h
@@ -44,7 +44,8 @@ public:
                 const int &gpu_mem, const int &cpu_math_library_num_threads,
                 const bool &use_mkldnn, const string &label_path,
                 const bool &use_tensorrt, const std::string &precision,
-                const int &table_batch_num, const int &table_max_len) {
+                const int &table_batch_num, const int &table_max_len,
+                const bool &merge_no_span_structure) {
     this->use_gpu_ = use_gpu;
     this->gpu_id_ = gpu_id;
     this->gpu_mem_ = gpu_mem;
@@ -55,7 +56,7 @@ public:
     this->table_batch_num_ = table_batch_num;
     this->table_max_len_ = table_max_len;

-    this->post_processor_.init(label_path);
+    this->post_processor_.init(label_path, merge_no_span_structure);
     LoadModel(model_dir);
   }

@@ -65,7 +66,7 @@ public:
   void Run(std::vector<cv::Mat> img_list,
            std::vector<std::vector<std::string>> &rec_html_tags,
            std::vector<float> &rec_scores,
-           std::vector<std::vector<std::vector<std::vector<int>>>> &rec_boxes,
+           std::vector<std::vector<std::vector<int>>> &rec_boxes,
            std::vector<double> &times);

 private:
diff --git a/deploy/cpp_infer/include/utility.h b/deploy/cpp_infer/include/utility.h
index 520804f64529303b5ecec27dc5f0895f1fff5c72..85b280fe25a46be70dba529891c3470a729dfbf1 100644
--- a/deploy/cpp_infer/include/utility.h
+++ b/deploy/cpp_infer/include/utility.h
@@ -42,6 +42,7 @@ struct OCRPredictResult {

 struct StructurePredictResult {
   std::vector<float> box;
+  std::vector<std::vector<float>> cell_box;
   std::string type;
   std::vector<OCRPredictResult> text_res;
   std::string html;
@@ -56,6 +57,10 @@ public:
                               const std::vector<OCRPredictResult> &ocr_result,
                               const std::string &save_path);

+  static void VisualizeBboxes(const cv::Mat &srcimg,
+                              const StructurePredictResult &structure_result,
+                              const std::string &save_path);
+
   template <class ForwardIterator>
   inline static size_t argmax(ForwardIterator first, ForwardIterator last) {
     return std::distance(first, std::max_element(first, last));
@@ -81,6 +86,9 @@ public:

   static void sorted_boxes(std::vector<OCRPredictResult> &ocr_result);

+  static std::vector<int> xyxyxyxy2xyxy(std::vector<std::vector<int>> &box);
+  static std::vector<int> xyxyxyxy2xyxy(std::vector<int> &box);
+
 private:
   static bool comparison_box(const OCRPredictResult &result1,
                              const OCRPredictResult &result2) {
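For readers cross-checking the C++ changes, the two new `xyxyxyxy2xyxy` overloads declared above collapse a 4-point quadrilateral into an axis-aligned `[left, top, right, bottom]` box. A minimal Python sketch of the same conversion (the function name mirrors the C++ helper; it is not part of the patch):

```python
# Sketch of the xyxyxyxy2xyxy helpers: collapse a 4-point quad, given either
# as [[x, y], ...] pairs or as a flat [x1, y1, ..., x4, y4] list, into an
# axis-aligned [left, top, right, bottom] box.

def xyxyxyxy2xyxy(box):
    if isinstance(box[0], (list, tuple)):  # [[x, y], ...] form
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
    else:                                  # flat [x1, y1, ..., x4, y4] form
        xs = box[0::2]
        ys = box[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]

print(xyxyxyxy2xyxy([[10, 5], [40, 8], [38, 30], [9, 28]]))  # [9, 5, 40, 30]
```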
diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md
index 2afdf79521223c4f473ded8d4f930546fb762c46..2974f3227aa6f9cdd967665addc905f7b902bac2 100644
--- a/deploy/cpp_infer/readme.md
+++ b/deploy/cpp_infer/readme.md
@@ -350,6 +350,7 @@ More parameters are as follows,
 |table_model_dir|string|-|Address of table recognition inference model|
 |table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|dictionary file|
 |table_max_len|int|488|The size of the long side of the input image of the table recognition model, the final input image size of the network is (table_max_len, table_max_len)|
+|merge_no_span_structure|bool|true|Whether to merge `<td>` and `</td>` to `<td></td>`|

 * PaddleOCR也支持多语言的预测,更多支持的语言和模型可以参考[识别文档](../../doc/doc_ch/recognition.md)中的多语言字典与模型部分,如果希望进行多语言预测,只需修改`rec_char_dict_path`(字典文件路径)以及`rec_model_dir`(inference模型路径)字段即可。
diff --git a/deploy/cpp_infer/src/args.cpp b/deploy/cpp_infer/src/args.cpp
index df1b9e32a3aacc309d6485114f9b267001f79920..17e9c8b625baf53c2583a6d778aba552cdd19e97 100644
--- a/deploy/cpp_infer/src/args.cpp
+++ b/deploy/cpp_infer/src/args.cpp
@@ -55,8 +55,10 @@ DEFINE_int32(rec_img_w, 320, "rec image width");
 DEFINE_string(table_model_dir, "", "Path of table structure inference model.");
 DEFINE_int32(table_max_len, 488, "max len size of input image.");
 DEFINE_int32(table_batch_num, 1, "table_batch_num.");
+DEFINE_bool(merge_no_span_structure, true,
+            "Whether merge <td> and </td> to <td></td>");
 DEFINE_string(table_char_dict_path,
-              "../../ppocr/utils/dict/table_structure_dict.txt",
+              "../../ppocr/utils/dict/table_structure_dict_ch.txt",
               "Path of dictionary.");

 // ocr forward related
diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp
index 66412a7b283f84107e117cfd59fb7d7aabff651c..34ffdc62674ef02b2d30c8e213a783495ceaff99 100644
--- a/deploy/cpp_infer/src/main.cpp
+++ b/deploy/cpp_infer/src/main.cpp
@@ -120,6 +120,7 @@ void structure(std::vector<cv::String> &cv_all_img_names) {
   engine.structure(cv_all_img_names, false, FLAGS_table);
   for (int i = 0; i < cv_all_img_names.size(); i++) {
     cout << "predict img: " << cv_all_img_names[i] << endl;
+    cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR);
     for (int j = 0; j < structure_results[i].size(); j++) {
       std::cout << j << "\ttype: " << structure_results[i][j].type
                 << ", region: [";
@@ -129,6 +130,11 @@ void structure(std::vector<cv::String> &cv_all_img_names) {
                 << structure_results[i][j].box[3] << "], res: ";
       if (structure_results[i][j].type == "table") {
         std::cout << structure_results[i][j].html << std::endl;
+        std::string file_name = Utility::basename(cv_all_img_names[i]);
+
+        Utility::VisualizeBboxes(srcimg, structure_results[i][j],
+                                 FLAGS_output + "/" + std::to_string(j) + "_" +
+                                     file_name);
       } else {
         Utility::print_result(structure_results[i][j].text_res);
       }
diff --git a/deploy/cpp_infer/src/ocr_cls.cpp b/deploy/cpp_infer/src/ocr_cls.cpp
index 674630bf1e7e04841e027a7320d62af4a453ffc8..92d83600cea04419db231c0097caa53ed6fec58b 100644
--- a/deploy/cpp_infer/src/ocr_cls.cpp
+++ b/deploy/cpp_infer/src/ocr_cls.cpp
@@ -112,6 +112,11 @@ void Classifier::LoadModel(const std::string &model_dir) {
       precision = paddle_infer::Config::Precision::kInt8;
     }
     config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false);
+    if (!Utility::PathExists("./trt_cls_shape.txt")) {
+      config.CollectShapeRangeInfo("./trt_cls_shape.txt");
+    } else {
+      config.EnableTunedTensorRtDynamicShape("./trt_cls_shape.txt", true);
+    }
   }
 } else {
   config.DisableGpu();
diff --git a/deploy/cpp_infer/src/ocr_det.cpp b/deploy/cpp_infer/src/ocr_det.cpp
index 56de195186a0d4d6c8b2482eb57c106347485928..0bfba4a2301a632696426f35b7be1dbacefe4cbf 100644
--- a/deploy/cpp_infer/src/ocr_det.cpp
+++ b/deploy/cpp_infer/src/ocr_det.cpp
@@ -32,49 +32,12 @@ void DBDetector::LoadModel(const std::string &model_dir) {
     if (this->precision_ == "int8") {
       precision = paddle_infer::Config::Precision::kInt8;
     }
-    config.EnableTensorRtEngine(1 << 20, 1, 20, precision, false, false);
-    std::map<std::string, std::vector<int>> min_input_shape = {
-        {"x", {1, 3, 50, 50}},
-        {"conv2d_92.tmp_0", {1, 120, 20, 20}},
-        {"conv2d_91.tmp_0", {1, 24, 10, 10}},
-        {"conv2d_59.tmp_0", {1, 96, 20, 20}},
-        {"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}},
-        {"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}},
-        {"conv2d_124.tmp_0", {1, 256, 20, 20}},
-        {"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}},
-        {"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}},
-        {"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}},
-        {"elementwise_add_7", {1, 56, 2, 2}},
-        {"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}};
-    std::map<std::string, std::vector<int>> max_input_shape = {
-        {"x", {1, 3, 1536, 1536}},
-        {"conv2d_92.tmp_0", {1, 120, 400, 400}},
-        {"conv2d_91.tmp_0", {1, 24, 200, 200}},
-        {"conv2d_59.tmp_0", {1, 96, 400, 400}},
-        {"nearest_interp_v2_1.tmp_0", {1, 256, 200, 200}},
-        {"nearest_interp_v2_2.tmp_0", {1, 256, 400, 400}},
-        {"conv2d_124.tmp_0", {1, 256, 400, 400}},
-        {"nearest_interp_v2_3.tmp_0", {1, 64, 400, 400}},
-        {"nearest_interp_v2_4.tmp_0", {1, 64, 400, 400}},
-        {"nearest_interp_v2_5.tmp_0", {1, 64, 400, 400}},
-        {"elementwise_add_7", {1, 56, 400, 400}},
-        {"nearest_interp_v2_0.tmp_0", {1, 256, 400, 400}}};
-    std::map<std::string, std::vector<int>> opt_input_shape = {
-        {"x", {1, 3, 640, 640}},
-        {"conv2d_92.tmp_0", {1, 120, 160, 160}},
-        {"conv2d_91.tmp_0", {1, 24, 80, 80}},
-        {"conv2d_59.tmp_0", {1, 96, 160, 160}},
-        {"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}},
-        {"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}},
-        {"conv2d_124.tmp_0", {1, 256, 160, 160}},
-        {"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}},
-        {"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}},
-        {"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}},
-        {"elementwise_add_7", {1, 56, 40, 40}},
-        {"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}};
-
-    config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
-                                  opt_input_shape);
+    config.EnableTensorRtEngine(1 << 30, 1, 20, precision, false, false);
+    if (!Utility::PathExists("./trt_det_shape.txt")) {
+      config.CollectShapeRangeInfo("./trt_det_shape.txt");
+    } else {
+      config.EnableTunedTensorRtDynamicShape("./trt_det_shape.txt", true);
+    }
   }
 } else {
   config.DisableGpu();
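The cls/det/rec loaders above all drop the hand-maintained min/max/opt shape maps in favor of a collect-then-tune flow: the first run records the tensor shape ranges actually seen into a file, and later runs build tuned TensorRT engines from that file. A sketch of the same pattern through the Paddle Inference Python API — `collect_shape_range_info` and `enable_tuned_tensorrt_dynamic_shape` are the Python counterparts used in `tools/infer/utility.py` below; the model paths and keyword values here are illustrative:

```python
# Collect-then-tune TensorRT dynamic-shape flow, mirroring the C++ changes.
import os
from paddle.inference import Config, PrecisionType, create_predictor

def build_config(model_dir, shape_file="./trt_det_shape.txt"):
    config = Config(os.path.join(model_dir, "inference.pdmodel"),
                    os.path.join(model_dir, "inference.pdiparams"))
    config.enable_use_gpu(500, 0)
    config.enable_tensorrt_engine(
        workspace_size=1 << 30,
        max_batch_size=1,
        min_subgraph_size=20,
        precision_mode=PrecisionType.Float32,
        use_static=False,
        use_calib_mode=False)
    if not os.path.exists(shape_file):
        # first run: record the shape ranges the model actually sees
        config.collect_shape_range_info(shape_file)
    else:
        # later runs: build tuned TRT engines from the recorded ranges
        config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
    return config

# predictor = create_predictor(build_config("./ch_PP-OCRv3_det_infer"))
```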
diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp
index 0f90ddfab4872f97829da081e64cb7437e72493a..90ad6598d325687bea1129d4db0a9ddb85409686 100644
--- a/deploy/cpp_infer/src/ocr_rec.cpp
+++ b/deploy/cpp_infer/src/ocr_rec.cpp
@@ -148,19 +148,11 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) {
       precision = paddle_infer::Config::Precision::kInt8;
     }
     config.EnableTensorRtEngine(1 << 20, 10, 15, precision, false, false);
-    int imgH = this->rec_image_shape_[1];
-    int imgW = this->rec_image_shape_[2];
-    std::map<std::string, std::vector<int>> min_input_shape = {
-        {"x", {1, 3, imgH, 10}}, {"lstm_0.tmp_0", {10, 1, 96}}};
-    std::map<std::string, std::vector<int>> max_input_shape = {
-        {"x", {this->rec_batch_num_, 3, imgH, 2500}},
-        {"lstm_0.tmp_0", {1000, 1, 96}}};
-    std::map<std::string, std::vector<int>> opt_input_shape = {
-        {"x", {this->rec_batch_num_, 3, imgH, imgW}},
-        {"lstm_0.tmp_0", {25, 1, 96}}};
-
-    config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
-                                  opt_input_shape);
+    if (!Utility::PathExists("./trt_rec_shape.txt")) {
+      config.CollectShapeRangeInfo("./trt_rec_shape.txt");
+    } else {
+      config.EnableTunedTensorRtDynamicShape("./trt_rec_shape.txt", true);
+    }
   }
 } else {
   config.DisableGpu();
diff --git a/deploy/cpp_infer/src/paddlestructure.cpp b/deploy/cpp_infer/src/paddlestructure.cpp
index 1ca85a96bbcf09472ce5916375a24a9441a2da53..ea69977a1e45b0f7c1235a647d7c56db4d3cbc74 100644
--- a/deploy/cpp_infer/src/paddlestructure.cpp
+++ b/deploy/cpp_infer/src/paddlestructure.cpp
@@ -27,7 +27,7 @@ PaddleStructure::PaddleStructure() {
         FLAGS_table_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem,
         FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_table_char_dict_path,
         FLAGS_use_tensorrt, FLAGS_precision, FLAGS_table_batch_num,
-        FLAGS_table_max_len);
+        FLAGS_table_max_len, FLAGS_merge_no_span_structure);
   }
 };

@@ -42,7 +42,7 @@ PaddleStructure::structure(std::vector<cv::String> cv_all_img_names,
   std::vector<std::vector<StructurePredictResult>> structure_results;

   if (!Utility::PathExists(FLAGS_output) && FLAGS_det) {
-    mkdir(FLAGS_output.c_str(), 0777);
+    Utility::CreateDir(FLAGS_output);
   }
   for (int i = 0; i < cv_all_img_names.size(); ++i) {
     std::vector<StructurePredictResult> structure_result;
@@ -84,7 +84,7 @@ void PaddleStructure::table(cv::Mat img,
   // predict structure
   std::vector<std::vector<std::string>> structure_html_tags;
   std::vector<float> structure_scores(1, 0);
-  std::vector<std::vector<std::vector<std::vector<int>>>> structure_boxes;
+  std::vector<std::vector<std::vector<int>>> structure_boxes;
   std::vector<double> structure_times;
   std::vector<cv::Mat> img_list;
   img_list.push_back(img);
@@ -103,20 +103,15 @@ void PaddleStructure::table(cv::Mat img,
       this->det(img_list[i], ocr_result, time_info_det);
       // crop image
       std::vector<cv::Mat> rec_img_list;
+      std::vector<int> ocr_box;
       for (int j = 0; j < ocr_result.size(); j++) {
-        int x_collect[4] = {ocr_result[j].box[0][0], ocr_result[j].box[1][0],
-                            ocr_result[j].box[2][0], ocr_result[j].box[3][0]};
-        int y_collect[4] = {ocr_result[j].box[0][1], ocr_result[j].box[1][1],
-                            ocr_result[j].box[2][1], ocr_result[j].box[3][1]};
-        int left = int(*std::min_element(x_collect, x_collect + 4));
-        int right = int(*std::max_element(x_collect, x_collect + 4));
-        int top = int(*std::min_element(y_collect, y_collect + 4));
-        int bottom = int(*std::max_element(y_collect, y_collect + 4));
-        std::vector<int> box{max(0, left - expand_pixel),
-                             max(0, top - expand_pixel),
-                             min(img_list[i].cols, right + expand_pixel),
-                             min(img_list[i].rows, bottom + expand_pixel)};
-        cv::Mat crop_img = Utility::crop_image(img_list[i], box);
+        ocr_box = Utility::xyxyxyxy2xyxy(ocr_result[j].box);
+        ocr_box[0] = max(0, ocr_box[0] - expand_pixel);
+        ocr_box[1] = max(0, ocr_box[1] - expand_pixel);
+        ocr_box[2] = min(img_list[i].cols, ocr_box[2] + expand_pixel);
+        ocr_box[3] = min(img_list[i].rows, ocr_box[3] + expand_pixel);
+
+        cv::Mat crop_img = Utility::crop_image(img_list[i], ocr_box);
         rec_img_list.push_back(crop_img);
       }
       // rec
@@ -125,38 +120,37 @@ void PaddleStructure::table(cv::Mat img,
       html = this->rebuild_table(structure_html_tags[i], structure_boxes[i],
                                  ocr_result);
       structure_result.html = html;
+      structure_result.cell_box = structure_boxes[i];
       structure_result.html_score = structure_scores[i];
     }
 };

-std::string PaddleStructure::rebuild_table(
-    std::vector<std::string> structure_html_tags,
-    std::vector<std::vector<std::vector<int>>> structure_boxes,
-    std::vector<OCRPredictResult> &ocr_result) {
+std::string
+PaddleStructure::rebuild_table(std::vector<std::string> structure_html_tags,
+                               std::vector<std::vector<int>> structure_boxes,
+                               std::vector<OCRPredictResult> &ocr_result) {
   // match text in same cell
   std::vector<std::vector<std::string>> matched(structure_boxes.size(),
                                                 std::vector<std::string>());
+  std::vector<int> ocr_box;
+  std::vector<int> structure_box;
   for (int i = 0; i < ocr_result.size(); i++) {
+    ocr_box = Utility::xyxyxyxy2xyxy(ocr_result[i].box);
+    ocr_box[0] -= 1;
+    ocr_box[1] -= 1;
+    ocr_box[2] += 1;
+    ocr_box[3] += 1;
     std::vector<std::vector<float>> dis_list(structure_boxes.size(),
                                              std::vector<float>(3, 100000.0));
     for (int j = 0; j < structure_boxes.size(); j++) {
-      int x_collect[4] = {ocr_result[i].box[0][0], ocr_result[i].box[1][0],
-                          ocr_result[i].box[2][0], ocr_result[i].box[3][0]};
-      int y_collect[4] = {ocr_result[i].box[0][1], ocr_result[i].box[1][1],
-                          ocr_result[i].box[2][1], ocr_result[i].box[3][1]};
-      int left = int(*std::min_element(x_collect, x_collect + 4));
-      int right = int(*std::max_element(x_collect, x_collect + 4));
-      int top = int(*std::min_element(y_collect, y_collect + 4));
-      int bottom = int(*std::max_element(y_collect, y_collect + 4));
-      std::vector<std::vector<int>> box(2, std::vector<int>(2, 0));
-      box[0][0] = left - 1;
-      box[0][1] = top - 1;
-      box[1][0] = right + 1;
-      box[1][1] = bottom + 1;
-
-      dis_list[j][0] = this->dis(box, structure_boxes[j]);
-      dis_list[j][1] = 1 - this->iou(box, structure_boxes[j]);
+      if (structure_boxes[j].size() == 8) {
+        structure_box = Utility::xyxyxyxy2xyxy(structure_boxes[j]);
+      } else {
+        structure_box = structure_boxes[j];
+      }
+      dis_list[j][0] = this->dis(ocr_box, structure_box);
+      dis_list[j][1] = 1 - this->iou(ocr_box, structure_box);
       dis_list[j][2] = j;
     }
     // find min dis idx
@@ -164,6 +158,7 @@ std::string PaddleStructure::rebuild_table(
               PaddleStructure::comparison_dis);
     matched[dis_list[0][2]].push_back(ocr_result[i].text);
   }
+
   // get pred html
   std::string html_str = "";
   int td_tag_idx = 0;
@@ -221,19 +216,18 @@ std::string PaddleStructure::rebuild_table(
   return html_str;
 }

-float PaddleStructure::iou(std::vector<std::vector<int>> &box1,
-                           std::vector<std::vector<int>> &box2) {
-  int area1 = max(0, box1[1][0] - box1[0][0]) * max(0, box1[1][1] - box1[0][1]);
-  int area2 = max(0, box2[1][0] - box2[0][0]) * max(0, box2[1][1] - box2[0][1]);
+float PaddleStructure::iou(std::vector<int> &box1, std::vector<int> &box2) {
+  int area1 = max(0, box1[2] - box1[0]) * max(0, box1[3] - box1[1]);
+  int area2 = max(0, box2[2] - box2[0]) * max(0, box2[3] - box2[1]);

   // computing the sum_area
   int sum_area = area1 + area2;

   // find each point of the intersect rectangle
-  int x1 = max(box1[0][0], box2[0][0]);
-  int y1 = max(box1[0][1], box2[0][1]);
-  int x2 = min(box1[1][0], box2[1][0]);
-  int y2 = min(box1[1][1], box2[1][1]);
+  int x1 = max(box1[0], box2[0]);
+  int y1 = max(box1[1], box2[1]);
+  int x2 = min(box1[2], box2[2]);
+  int y2 = min(box1[3], box2[3]);

   // judge if there is an intersect
   if (y1 >= y2 || x1 >= x2) {
@@ -244,17 +238,16 @@ float PaddleStructure::iou(std::vector<int> &box1, std::vector<int> &box2) {
   }
 }

-float PaddleStructure::dis(std::vector<std::vector<int>> &box1,
-                           std::vector<std::vector<int>> &box2) {
-  int x1_1 = box1[0][0];
-  int y1_1 = box1[0][1];
-  int x2_1 = box1[1][0];
-  int y2_1 = box1[1][1];
+float PaddleStructure::dis(std::vector<int> &box1, std::vector<int> &box2) {
+  int x1_1 = box1[0];
+  int y1_1 = box1[1];
+  int x2_1 = box1[2];
+  int y2_1 = box1[3];

-  int x1_2 = box2[0][0];
-  int y1_2 = box2[0][1];
-  int x2_2 = box2[1][0];
-  int y2_2 = box2[1][1];
+  int x1_2 = box2[0];
+  int y1_2 = box2[1];
+  int x2_2 = box2[2];
+  int y2_2 = box2[3];

   float dis =
       abs(x1_2 - x1_1) + abs(y1_2 - y1_1) + abs(x2_2 - x2_1) + abs(y2_2 - y2_1);
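In `rebuild_table` above, each OCR text box is matched to the structure cell that minimizes the pair (L1 distance between corners, 1 − IoU); `dis_list` stores that pair plus the cell index, is sorted with `comparison_dis`, and the closest index wins. A small Python sketch of the matching rule on xyxy boxes (the key order is illustrative, since `comparison_dis`'s body is not shown in this patch):

```python
# Cell-matching rule: assign an OCR box to the structure cell with the
# smallest (L1 corner distance, 1 - IoU) sort key.

def iou(a, b):
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    if x1 >= x2 or y1 >= y2:
        return 0.0
    inter = (x2 - x1) * (y2 - y1)
    area_a = max(0, a[2] - a[0]) * max(0, a[3] - a[1])
    area_b = max(0, b[2] - b[0]) * max(0, b[3] - b[1])
    return inter / (area_a + area_b - inter)

def l1_dis(a, b):
    return sum(abs(ai - bi) for ai, bi in zip(a, b))

def match_cell(ocr_box, cells):
    return min(range(len(cells)),
               key=lambda j: (l1_dis(ocr_box, cells[j]),
                              1 - iou(ocr_box, cells[j])))

cells = [[0, 0, 50, 20], [60, 0, 110, 20]]
print(match_cell([62, 2, 100, 18], cells))  # -> 1 (second cell)
```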
diff --git a/deploy/cpp_infer/src/postprocess_op.cpp b/deploy/cpp_infer/src/postprocess_op.cpp
index 551f98a1668124f83ef615f0a41b081508898d6e..4b0c693c80467bceb75da2b3fef6e816b0690979 100644
--- a/deploy/cpp_infer/src/postprocess_op.cpp
+++ b/deploy/cpp_infer/src/postprocess_op.cpp
@@ -352,8 +352,21 @@ std::vector<std::vector<std::vector<int>>> DBPostProcessor::FilterTagDetRes(
   return root_points;
 }

-void TablePostProcessor::init(std::string label_path) {
+void TablePostProcessor::init(std::string label_path,
+                              bool merge_no_span_structure) {
   this->label_list_ = Utility::ReadDict(label_path);
+  if (merge_no_span_structure) {
+    this->label_list_.push_back("<td></td>");
+    std::vector<std::string>::iterator it;
+    for (it = this->label_list_.begin(); it != this->label_list_.end();) {
+      if (*it == "<td>") {
+        it = this->label_list_.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+
   // add_special_char
   this->label_list_.insert(this->label_list_.begin(), this->beg);
   this->label_list_.push_back(this->end);
 }
@@ -363,12 +376,12 @@ void TablePostProcessor::Run(
     std::vector<float> &rec_scores, std::vector<int> &loc_preds_shape,
     std::vector<int> &structure_probs_shape,
     std::vector<std::vector<std::string>> &rec_html_tag_batch,
-    std::vector<std::vector<std::vector<std::vector<int>>>> &rec_boxes_batch,
+    std::vector<std::vector<std::vector<int>>> &rec_boxes_batch,
    std::vector<int> &width_list, std::vector<int> &height_list) {
   for (int batch_idx = 0; batch_idx < structure_probs_shape[0]; batch_idx++) {
     // image tags and boxs
     std::vector<std::string> rec_html_tags;
-    std::vector<std::vector<std::vector<int>>> rec_boxes;
+    std::vector<std::vector<int>> rec_boxes;

     float score = 0.f;
     int count = 0;
@@ -378,7 +391,7 @@ void TablePostProcessor::Run(
     // step
     for (int step_idx = 0; step_idx < structure_probs_shape[1]; step_idx++) {
       std::string html_tag;
-      std::vector<std::vector<int>> rec_box;
+      std::vector<float> rec_box;
       // html tag
       int step_start_idx = (batch_idx * structure_probs_shape[1] + step_idx) *
                            structure_probs_shape[2];
@@ -399,17 +412,19 @@ void TablePostProcessor::Run(
       count += 1;
       score += char_score;
       rec_html_tags.push_back(html_tag);
+
       // box
       if (html_tag == "<td>" || html_tag == "<td" || html_tag == "<td></td>") {
-        for (int point_idx = 0; point_idx < loc_preds_shape[2];
-             point_idx += 2) {
-          std::vector<int> point(2, 0);
+        for (int point_idx = 0; point_idx < loc_preds_shape[2]; point_idx++) {
           step_start_idx = (batch_idx * structure_probs_shape[1] + step_idx) *
                                loc_preds_shape[2] +
                            point_idx;
-          point[0] = int(loc_preds[step_start_idx] * width_list[batch_idx]);
-          point[1] =
-              int(loc_preds[step_start_idx + 1] * height_list[batch_idx]);
+          float point = loc_preds[step_start_idx];
+          if (point_idx % 2 == 0) {
+            point = int(point * width_list[batch_idx]);
+          } else {
+            point = int(point * height_list[batch_idx]);
+          }
           rec_box.push_back(point);
         }
         rec_boxes.push_back(rec_box);
diff --git a/deploy/cpp_infer/src/structure_table.cpp b/deploy/cpp_infer/src/structure_table.cpp
index bbc32580e49d6ed7b29e3f0931eab0b0969b02b9..7df0ab94b5df8a62148ceb01f48b35d73b14f78c 100644
--- a/deploy/cpp_infer/src/structure_table.cpp
+++ b/deploy/cpp_infer/src/structure_table.cpp
@@ -20,7 +20,7 @@ void StructureTableRecognizer::Run(
     std::vector<cv::Mat> img_list,
     std::vector<std::vector<std::string>> &structure_html_tags,
     std::vector<float> &structure_scores,
-    std::vector<std::vector<std::vector<std::vector<int>>>> &structure_boxes,
+    std::vector<std::vector<std::vector<int>>> &structure_boxes,
     std::vector<double> &times) {
   std::chrono::duration<float> preprocess_diff =
       std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
@@ -89,8 +89,7 @@ void StructureTableRecognizer::Run(
     auto postprocess_start = std::chrono::steady_clock::now();
     std::vector<std::vector<std::string>> structure_html_tag_batch;
    std::vector<float> structure_score_batch;
-    std::vector<std::vector<std::vector<std::vector<int>>>>
-        structure_boxes_batch;
+    std::vector<std::vector<std::vector<int>>> structure_boxes_batch;
     this->post_processor_.Run(loc_preds, structure_probs,
                               structure_score_batch, predict_shape0,
                               predict_shape1, structure_html_tag_batch,
                               structure_boxes_batch,
diff --git a/deploy/cpp_infer/src/utility.cpp b/deploy/cpp_infer/src/utility.cpp
index 4bfc1d091d6124b10c79032beb702ba8727210fc..0e6ba17fc3bab5b5e005f8b5e41640899bee39d0 100644
--- a/deploy/cpp_infer/src/utility.cpp
+++ b/deploy/cpp_infer/src/utility.cpp
@@ -65,6 +65,37 @@ void Utility::VisualizeBboxes(const cv::Mat &srcimg,
             << std::endl;
 }

+void Utility::VisualizeBboxes(const cv::Mat &srcimg,
+                              const StructurePredictResult &structure_result,
+                              const std::string &save_path) {
+  cv::Mat img_vis;
+  srcimg.copyTo(img_vis);
+  for (int n = 0; n < structure_result.cell_box.size(); n++) {
+    if (structure_result.cell_box[n].size() == 8) {
+      cv::Point rook_points[4];
+      for (int m = 0; m < structure_result.cell_box[n].size(); m += 2) {
+        rook_points[m / 2] =
+            cv::Point(int(structure_result.cell_box[n][m]),
+                      int(structure_result.cell_box[n][m + 1]));
+      }
+      const cv::Point *ppt[1] = {rook_points};
+      int npt[] = {4};
+      cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
+    } else if (structure_result.cell_box[n].size() == 4) {
+      cv::Point rook_points[2];
+      rook_points[0] = cv::Point(int(structure_result.cell_box[n][0]),
+                                 int(structure_result.cell_box[n][1]));
+      rook_points[1] = cv::Point(int(structure_result.cell_box[n][2]),
+                                 int(structure_result.cell_box[n][3]));
+      cv::rectangle(img_vis, rook_points[0], rook_points[1], CV_RGB(0, 255, 0),
+                    2, 8, 0);
+    }
+  }
+
+  cv::imwrite(save_path, img_vis);
+  std::cout << "The table visualized image saved in " + save_path << std::endl;
+}
+
 // list all files under a directory
 void Utility::GetAllFiles(const char *dir_name,
                           std::vector<std::string> &all_inputs) {
@@ -268,13 +299,46 @@ cv::Mat Utility::crop_image(cv::Mat &img, std::vector<int> &area) {

 void Utility::sorted_boxes(std::vector<OCRPredictResult> &ocr_result) {
   std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box);
-
-  for (int i = 0; i < ocr_result.size() - 1; i++) {
-    if (abs(ocr_result[i + 1].box[0][1] - ocr_result[i].box[0][1]) < 10 &&
-        (ocr_result[i + 1].box[0][0] < ocr_result[i].box[0][0])) {
-      std::swap(ocr_result[i], ocr_result[i + 1]);
+  if (ocr_result.size() > 0) {
+    for (int i = 0; i < ocr_result.size() - 1; i++) {
+      for (int j = i; j > 0; j--) {
+        if (abs(ocr_result[j + 1].box[0][1] - ocr_result[j].box[0][1]) < 10 &&
+            (ocr_result[j + 1].box[0][0] < ocr_result[j].box[0][0])) {
+          std::swap(ocr_result[j], ocr_result[j + 1]);
+        }
+      }
     }
   }
 }

+std::vector<int> Utility::xyxyxyxy2xyxy(std::vector<std::vector<int>> &box) {
+  int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]};
+  int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]};
+  int left = int(*std::min_element(x_collect, x_collect + 4));
+  int right = int(*std::max_element(x_collect, x_collect + 4));
+  int top = int(*std::min_element(y_collect, y_collect + 4));
+  int bottom = int(*std::max_element(y_collect, y_collect + 4));
+  std::vector<int> box1(4, 0);
+  box1[0] = left;
+  box1[1] = top;
+  box1[2] = right;
+  box1[3] = bottom;
+  return box1;
+}
+
+std::vector<int> Utility::xyxyxyxy2xyxy(std::vector<int> &box) {
+  int x_collect[4] = {box[0], box[2], box[4], box[6]};
+  int y_collect[4] = {box[1], box[3], box[5], box[7]};
+  int left = int(*std::min_element(x_collect, x_collect + 4));
+  int right = int(*std::max_element(x_collect, x_collect + 4));
+  int top = int(*std::min_element(y_collect, y_collect + 4));
+  int bottom = int(*std::max_element(y_collect, y_collect + 4));
+  std::vector<int> box1(4, 0);
+  box1[0] = left;
+  box1[1] = top;
+  box1[2] = right;
+  box1[3] = bottom;
+  return box1;
+}
+
 } // namespace PaddleOCR
\ No newline at end of file
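The `merge_no_span_structure` switch handled in `TablePostProcessor::init` above rewrites the structure dictionary before decoding: the fused `<td></td>` token is appended and the bare `<td>` token removed, so non-span cells decode in a single step. A Python sketch of the same rewrite (the begin/end tokens are placeholders for the C++ `beg`/`end` members):

```python
# What TablePostProcessor::init does to the structure dictionary when
# merge_no_span_structure is enabled.

def build_label_list(dict_tokens, merge_no_span_structure=True,
                     beg="sos", end="eos"):  # placeholder special tokens
    label_list = list(dict_tokens)
    if merge_no_span_structure:
        label_list.append("<td></td>")                   # fused no-span cell
        label_list = [t for t in label_list if t != "<td>"]
    return [beg] + label_list + [end]                    # add_special_char

print(build_label_list(["<thead>", "<tr>", "<td>", "</td>", "</tr>"]))
# ['sos', '<thead>', '<tr>', '</td>', '</tr>', '<td></td>', 'eos']
```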
dda0d2b0320544d3a82f59b0672c086c64d83d3d..404249323b6cb5de345438056a9a10abd64b38bc 100644
--- a/deploy/lite/config.txt
+++ b/deploy/lite/config.txt
@@ -5,4 +5,4 @@ det_db_unclip_ratio 1.6
 det_db_use_dilate 0
 det_use_polygon_score 1
 use_direction_classify 1
-rec_image_height 32
\ No newline at end of file
+rec_image_height 48
\ No newline at end of file
diff --git a/deploy/lite/readme.md b/deploy/lite/readme.md
index a1bef8120e52dd91db0fda4ac2a4d91cc2800818..fc91cbfa7d69f6a8c1086243e4df3f820bd78339 100644
--- a/deploy/lite/readme.md
+++ b/deploy/lite/readme.md
@@ -99,6 +99,8 @@ The following table also provides a series of models that can be deployed on mobile

 |Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch|
 |---|---|---|---|---|---|---|
+|PP-OCRv3|extra-lightweight Chinese OCR optimized model|16.2M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb)|v2.10|
+|PP-OCRv3(slim)|extra-lightweight Chinese OCR optimized model|5.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb)|v2.10|
 |PP-OCRv2|extra-lightweight Chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10|
 |PP-OCRv2(slim)|extra-lightweight Chinese OCR optimized model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10|
@@ -134,17 +136,16 @@ Introduction to paddle_lite_opt parameters:
 The following takes the ultra-lightweight Chinese model of PaddleOCR as an example to introduce the use of the compiled opt file to complete the conversion of the inference model to the Paddle-Lite optimized model

 ```
-# [Recommendation] Download the Chinese and English inference model of PP-OCRv2
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
+# [Recommendation] Download the Chinese and English inference model of PP-OCRv3
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar
 wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar
 # Convert detection model
-./opt --model_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
+paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
 # Convert recognition model
-./opt --model_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
+paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
 # Convert angle classifier model
-./opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
-
+paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
 ```

 After the conversion is successful, there will be more files ending with `.nb` in the inference model directory, which is the successfully converted model file.
@@ -197,15 +198,15 @@ Some preparatory work is required first.
 cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/
 ```

-Prepare the test image, taking PaddleOCR/doc/imgs/11.jpg as an example, copy the image file to the demo/cxx/ocr/debug/ folder. Prepare the model files optimized by the lite opt tool, ch_det_mv3_db_opt.nb, ch_rec_mv3_crnn_opt.nb, and place them under the demo/cxx/ocr/debug/ folder.
+Prepare the test image, taking PaddleOCR/doc/imgs/11.jpg as an example, copy the image file to the demo/cxx/ocr/debug/ folder. Prepare the model files optimized by the lite opt tool, ch_PP-OCRv3_det_slim_opt.nb and ch_PP-OCRv3_rec_slim_opt.nb, and place them under the demo/cxx/ocr/debug/ folder.

 The structure of the OCR demo is as follows after the above command is executed:

 ```
 demo/cxx/ocr/
 |-- debug/
-|   |--ch_PP-OCRv2_det_slim_opt.nb           Detection model
-|   |--ch_PP-OCRv2_rec_slim_opt.nb           Recognition model
+|   |--ch_PP-OCRv3_det_slim_opt.nb           Detection model
+|   |--ch_PP-OCRv3_rec_slim_opt.nb           Recognition model
 |   |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb  Text direction classification model
 |   |--11.jpg                                Image for OCR
 |   |--ppocr_keys_v1.txt                     Dictionary file
@@ -240,7 +241,7 @@
 det_db_thresh          0.3   # Used to filter the binarized image of DB prediction, setting 0.-0.3 has no obvious effect on the result
 det_db_box_thresh      0.5   # DB post-processing filter box threshold, if there is a missing box detected, it can be reduced as appropriate
 det_db_unclip_ratio    1.6   # Indicates the compactness of the text box, the smaller the value, the closer the text box to the text
 use_direction_classify 0     # Whether to use the direction classifier, 0 means not to use, 1 means to use
-rec_image_height       32    # The height of the input image of the recognition model; the PP-OCRv3 model needs to be set to 48, and the PP-OCRv2 model to 32
+rec_image_height       48    # The height of the input image of the recognition model; the PP-OCRv3 model needs to be set to 48, and the PP-OCRv2 model to 32
 ```
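As the hunks above show, the demo's `config.txt` is plain `key value` lines, one setting per line. If you need to read or patch it from a script, a tiny loader sketch (a hypothetical helper, not part of the demo itself):

```python
# Read the lite demo's "key value" config.txt into a dict.

def load_config(path):
    cfg = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:           # ignore blank/malformed lines
                cfg[parts[0]] = parts[1]
    return cfg

cfg = load_config("config.txt")
print(cfg.get("rec_image_height"))        # "48" for PP-OCRv3 rec models
```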
 5. Run Model on phone

@@ -260,14 +261,14 @@ After the above steps are completed, you can use adb to push the file to the phone
 export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH
 # The use of ocr_db_crnn is:
 # ./ocr_db_crnn Mode Detection_model_file Orientation_classifier_model_file Recognition_model_file Hardware Precision Threads Batchsize Test_image_path Dictionary_file_path
-./ocr_db_crnn system ch_PP-OCRv2_det_slim_opt.nb ch_PP-OCRv2_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True
+./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True
 # precision can be INT8 for quantitative model or FP32 for normal model.

 # Only using detection model
-./ocr_db_crnn det ch_PP-OCRv2_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt
+./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt

 # Only using recognition model
-./ocr_db_crnn rec ch_PP-OCRv2_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
+./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
 ```

 If you modify the code, you need to recompile and push to the phone.
diff --git a/deploy/lite/readme_ch.md b/deploy/lite/readme_ch.md
index 0793827fe647c470944fc36e2b243c8f7e704e99..78e2510917e0fd85c4a724ec74eccb0b7cfc6118 100644
--- a/deploy/lite/readme_ch.md
+++ b/deploy/lite/readme_ch.md
@@ -97,6 +97,8 @@ Paddle-Lite 提供了多种策略来自动优化原始的模型,其中包括
 |模型版本|模型简介|模型大小|检测模型|文本方向分类模型|识别模型|Paddle-Lite版本|
 |---|---|---|---|---|---|---|
+|PP-OCRv3|蒸馏版超轻量中文OCR移动端模型|16.2M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb)|v2.10|
+|PP-OCRv3(slim)|蒸馏版超轻量中文OCR移动端模型|5.9M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb)|v2.10|
 |PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10|
 |PP-OCRv2(slim)|蒸馏版超轻量中文OCR移动端模型|4.6M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10|
@@ -131,16 +133,16 @@ paddle_lite_opt 参数介绍:
 下面以PaddleOCR的超轻量中文模型为例,介绍使用编译好的opt文件完成inference模型到Paddle-Lite优化模型的转换。

 ```
-# 【推荐】 下载 PP-OCRv2版本的中英文 inference模型
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
+# 【推荐】 下载 PP-OCRv3版本的中英文 inference模型
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar
 wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar
 # 转换检测模型
-./opt --model_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
+paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
 # 转换识别模型
-./opt --model_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
+paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
 # 转换方向分类器模型
-./opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
+paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
 ```

@@ -194,15 +196,15 @@
 ```
 准备测试图像,以`PaddleOCR/doc/imgs/11.jpg`为例,将测试的图像复制到`demo/cxx/ocr/debug/`文件夹下。
-准备lite opt工具优化后的模型文件,比如使用`ch_PP-OCRv2_det_slim_opt.nb, ch_PP-OCRv2_rec_slim_opt.nb, ch_ppocr_mobile_v2.0_cls_slim_opt.nb`,模型文件放置在`demo/cxx/ocr/debug/`文件夹下。
+准备lite opt工具优化后的模型文件,比如使用`ch_PP-OCRv3_det_slim_opt.nb, ch_PP-OCRv3_rec_slim_opt.nb, ch_ppocr_mobile_v2.0_cls_slim_opt.nb`,模型文件放置在`demo/cxx/ocr/debug/`文件夹下。

 执行完成后,ocr文件夹下将有如下文件格式:

 ```
 demo/cxx/ocr/
 |-- debug/
-|   |--ch_PP-OCRv2_det_slim_opt.nb           优化后的检测模型文件
-|   |--ch_PP-OCRv2_rec_slim_opt.nb           优化后的识别模型文件
+|   |--ch_PP-OCRv3_det_slim_opt.nb           优化后的检测模型文件
+|   |--ch_PP-OCRv3_rec_slim_opt.nb           优化后的识别模型文件
 |   |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb  优化后的文字方向分类器模型文件
 |   |--11.jpg                                待测试图像
 |   |--ppocr_keys_v1.txt                     中文字典文件
@@ -239,7 +241,7 @@
 det_db_thresh          0.3   # 用于过滤DB预测的二值化图像,设置为0.-0.3对结果影响不明显
 det_db_box_thresh      0.5   # 检测器后处理过滤box的阈值,如果检测存在漏框情况,可酌情减小
 det_db_unclip_ratio    1.6   # 表示文本框的紧致程度,越小则文本框更靠近文本
 use_direction_classify 0     # 是否使用方向分类器,0表示不使用,1表示使用
-rec_image_height       32    # 识别模型输入图像的高度,PP-OCRv3模型设置为48,PP-OCRv2模型需要设置为32
+rec_image_height       48    # 识别模型输入图像的高度,PP-OCRv3模型设置为48,PP-OCRv2模型需要设置为32
 ```
 5. 启动调试

@@ -259,13 +261,13 @@ rec_image_height 32  # 识别模型输入图像的高度,PP-OCRv3模型
 export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH
 # 开始使用,ocr_db_crnn可执行文件的使用方式为:
 # ./ocr_db_crnn 预测模式 检测模型文件 方向分类器模型文件 识别模型文件 运行硬件 运行精度 线程数 batchsize 测试图像路径 参数配置路径 字典文件路径 是否使用benchmark参数
-./ocr_db_crnn system ch_PP-OCRv2_det_slim_opt.nb ch_PP-OCRv2_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True
+./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True

 # 仅使用文本检测模型,使用方式如下:
-./ocr_db_crnn det ch_PP-OCRv2_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt
+./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt

 # 仅使用文本识别模型,使用方式如下:
-./ocr_db_crnn rec ch_PP-OCRv2_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
+./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
 ```

 如果对代码做了修改,则需要重新编译并push到手机上。
diff --git a/deploy/slim/quantization/README.md b/deploy/slim/quantization/README.md
index 4c1d784b99aade614d78b4bd6fb20afef15f0f6f..8b29693c9803f004f123b5497c9224ae5c31041d 100644
--- a/deploy/slim/quantization/README.md
+++ b/deploy/slim/quantization/README.md
@@ -22,7 +22,7 @@
 ### 1. 安装PaddleSlim

 ```bash
-pip3 install paddleslim==2.2.2
+pip3 install paddleslim==2.3.2
 ```

 ### 2. 准备训练好的模型
@@ -32,18 +32,7 @@ PaddleOCR提供了一系列训练好的[模型](../../../doc/doc_ch/models_list.md)
 ### 3. 量化训练

 量化训练包括离线量化训练和在线量化训练,在线量化训练效果更好,需加载预训练模型,在定义好量化策略后即可对模型进行量化。
-
-量化训练的代码位于slim/quantization/quant.py 中,比如训练检测模型,训练指令如下:
-```bash
-python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model='your trained model' Global.save_model_dir=./output/quant_model
-
-# 比如下载提供的训练模型
-wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar
-tar -xf ch_ppocr_mobile_v2.0_det_train.tar
-python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model
-```
-
-模型蒸馏和模型量化可以同时使用,以PPOCRv3检测模型为例:
+量化训练的代码位于slim/quantization/quant.py 中,比如训练检测模型,以PPOCRv3检测模型为例,训练指令如下:
 ```
 # 下载检测预训练模型:
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
 tar xf ch_PP-OCRv3_det_distill_train.tar
 python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml
@@ -58,7 +47,7 @@
 在得到量化训练保存的模型后,我们可以将其导出为inference_model,用于预测部署:

 ```bash
-python deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
+python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
 ```

 ### 5. 量化模型部署
diff --git a/deploy/slim/quantization/README_en.md b/deploy/slim/quantization/README_en.md
index c6796ae9dc256496308e432023c45ef1026c3d92..f82c3d844e292ee76b95624f7632ed40301e5a4c 100644
--- a/deploy/slim/quantization/README_en.md
+++ b/deploy/slim/quantization/README_en.md
@@ -25,7 +25,7 @@ After training, if you want to further compress the model size and accelerate the prediction
 ### 1. Install PaddleSlim

 ```bash
-pip3 install paddleslim==2.2.2
+pip3 install paddleslim==2.3.2
 ```

@@ -39,18 +39,7 @@ Quantization training includes offline quantization training and online quantization training.
 Online quantization training is more effective. It is necessary to load the pre-trained model.
 After the quantization strategy is defined, the model can be quantized.

-The code for quantization training is located in `slim/quantization/quant.py`. For example, to train a detection model, the training instructions are as follows:
-```bash
-python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model='your trained model' Global.save_model_dir=./output/quant_model
-
-# download provided model
-wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar
-tar -xf ch_ppocr_mobile_v2.0_det_train.tar
-python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model
-```
-
-Model distillation and model quantization can be used at the same time, taking the PPOCRv3 detection model as an example:
+The code for quantization training is located in `slim/quantization/quant.py`. For example, the training instructions for the slim PPOCRv3 detection model are as follows:
 ```
 # download provided model
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
@@ -66,7 +55,7 @@ If you want to quantize the text recognition model, you can modify the configuration
 Once we get the model after quantization training and fine-tuning, we can export it as an inference model for the deployment of prediction tasks:

 ```bash
-python deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
+python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
 ```
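For orientation, `quant.py` wraps the model with PaddleSlim's dygraph `QAT` helper before running the normal training loop. A hedged, self-contained sketch of that flow — the config values and the LeNet stand-in model are illustrative only; see `deploy/slim/quantization/quant.py` for the real wiring:

```python
# Sketch of quantization-aware training with PaddleSlim's dygraph QAT wrapper.
import paddle
from paddleslim.dygraph.quant import QAT

quant_config = {
    # illustrative settings; quant.py builds these from the yml config
    "weight_quantize_type": "channel_wise_abs_max",
    "activation_quantize_type": "moving_average_abs_max",
    "quantizable_layer_type": ["Conv2D", "Linear"],
}

model = paddle.vision.models.LeNet()   # stand-in for the OCR det/rec model
quanter = QAT(config=quant_config)
quanter.quantize(model)                # insert fake-quant ops in place

# ... fine-tune `model` with the usual training loop ...

quanter.save_quantized_model(
    model, "./output/quant_inference_model/inference",
    input_spec=[paddle.static.InputSpec(shape=[None, 1, 28, 28],
                                        dtype="float32")])
```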
### 5. Deploy

diff --git a/deploy/slim/quantization/export_model.py b/deploy/slim/quantization/export_model.py
index fd1c3e5e109667fa74f5ade18b78f634e4d325db..bd132b625181cab853961efd2e2c38c411e9edf4 100755
--- a/deploy/slim/quantization/export_model.py
+++ b/deploy/slim/quantization/export_model.py
@@ -151,17 +151,24 @@ def main():

     arch_config = config["Architecture"]

-    arch_config = config["Architecture"]
+    if arch_config["algorithm"] == "SVTR" and arch_config["Head"][
+            "name"] != 'MultiHead':
+        input_shape = config["Eval"]["dataset"]["transforms"][-2][
+            'SVTRRecResizeImg']['image_shape']
+    else:
+        input_shape = None

     if arch_config["algorithm"] in ["Distillation", ]:  # distillation model
         archs = list(arch_config["Models"].values())
         for idx, name in enumerate(model.model_name_list):
             sub_model_save_path = os.path.join(save_path, name, "inference")
             export_single_model(model.model_list[idx], archs[idx],
-                                sub_model_save_path, logger, quanter)
+                                sub_model_save_path, logger, input_shape,
+                                quanter)
     else:
         save_path = os.path.join(save_path, "inference")
-        export_single_model(model, arch_config, save_path, logger, quanter)
+        export_single_model(model, arch_config, save_path, logger, input_shape,
+                            quanter)


 if __name__ == "__main__":
diff --git a/doc/doc_en/algorithm_sdmgr_en.md b/doc/doc_en/algorithm_sdmgr_en.md
index 7b3752223dd765e780d56d146c90bd0f892aac7b..4a10ec7dea5e942c5991991eef598d970c189d1b 100644
--- a/doc/doc_en/algorithm_sdmgr_en.md
+++ b/doc/doc_en/algorithm_sdmgr_en.md
@@ -31,7 +31,7 @@ The prediction result is saved as `./output/sdmgr_kie/predicts_kie.txt`, and the

 The visualization results are shown in the figure below:

-<img src="...">
+<img src="...">
 ## 2. Model Training
diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md
index d467a7f918ed57eb80754483715f3671fd2552c7..ea5c1a472e9cc45bdededc97475a7c423dff1047 100644
--- a/doc/doc_en/config_en.md
+++ b/doc/doc_en/config_en.md
@@ -28,7 +28,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example
 | epoch_num | Maximum training epoch number | 500 | \ |
 | log_smooth_window | Log queue length, the median value in the queue each time will be printed | 20 | \ |
 | print_batch_step | Set print log interval | 10 | \ |
-| save_model_dir | Set model save path | output/{算法名称} | \ |
+| save_model_dir | Set model save path | output/{algorithm_name} | \ |
 | save_epoch_step | Set model save interval | 3 | \ |
 | eval_batch_step | Set the model evaluation interval | 2000 or [1000, 2000] | running evaluation every 2000 iters or evaluation is run every 2000 iterations after the 1000th iteration |
 | cal_metric_during_train | Set whether to evaluate the metric during the training process. At this time, the metric of the model under the current batch is evaluated | true | \ |
@@ -245,4 +245,4 @@ For more supported languages, please refer to : [Multi-language model](https://g

 The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods.
 * [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi.
-* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
\ No newline at end of file
+* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
diff --git a/ppocr/losses/rec_aster_loss.py b/ppocr/losses/rec_aster_loss.py
index 52605e46db35339cc22f7f1e6642456bfaf02f11..9927fbc043f2af146e51cbb9a549f1dffc980341 100644
--- a/ppocr/losses/rec_aster_loss.py
+++ b/ppocr/losses/rec_aster_loss.py
@@ -27,10 +27,9 @@ class CosineEmbeddingLoss(nn.Layer):
         self.epsilon = 1e-12

     def forward(self, x1, x2, target):
-        similarity = paddle.sum(
-            x1 * x2, dim=-1) / (paddle.norm(
-                x1, axis=-1) * paddle.norm(
-                    x2, axis=-1) + self.epsilon)
+        similarity = paddle.sum(x1 * x2, axis=-1) / (paddle.norm(
+            x1, axis=-1) * paddle.norm(
+                x2, axis=-1) + self.epsilon)
         one_list = paddle.full_like(target, fill_value=1)
         out = paddle.mean(
             paddle.where(
diff --git a/ppstructure/utility.py b/ppstructure/utility.py
index bdea0af69e37e15d1f191b2a86c036ae1c2b1e45..97b6d6fec0d70fe3014b0b2105dbbef6a292e4d7 100644
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -32,7 +32,7 @@ def init_args():
     parser.add_argument(
         "--table_char_dict_path",
         type=str,
-        default="../ppocr/utils/dict/table_structure_dict.txt")
+        default="../ppocr/utils/dict/table_structure_dict_ch.txt")
     # params for layout
     parser.add_argument("--layout_model_dir", type=str)
     parser.add_argument(
diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index 9baf66d7f469a3bf6c9a140e034aee3a635a5c8e..e6adad3dd8c2d57775ab5f7fa489dca98d22eb3d 100644
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -38,7 +38,6 @@ def init_args():
     parser.add_argument("--ir_optim", type=str2bool, default=True)
     parser.add_argument("--use_tensorrt", type=str2bool, default=False)
     parser.add_argument("--min_subgraph_size", type=int, default=15)
-    parser.add_argument("--shape_info_filename", type=str, default=None)
     parser.add_argument("--precision", type=str, default="fp32")
     parser.add_argument("--gpu_mem", type=int, default=500)

@@ -226,23 +225,22 @@ def
create_predictor(args, mode, logger):
                 use_calib_mode=False)

             # collect shape
-            if args.shape_info_filename is not None:
-                if not os.path.exists(args.shape_info_filename):
-                    config.collect_shape_range_info(
-                        args.shape_info_filename)
-                    logger.info(
-                        f"collect dynamic shape info into : {args.shape_info_filename}"
-                    )
-                else:
-                    logger.info(
-                        f"dynamic shape info file( {args.shape_info_filename} ) already exists, no need to generate again."
-                    )
-                config.enable_tuned_tensorrt_dynamic_shape(
-                    args.shape_info_filename, True)
+            trt_shape_f = os.path.join(model_dir,
+                                       f"{mode}_trt_dynamic_shape.txt")
+
+            if not os.path.exists(trt_shape_f):
+                config.collect_shape_range_info(trt_shape_f)
+                logger.info(
+                    f"collect dynamic shape info into : {trt_shape_f}")
             else:
                 logger.info(
-                    f"when using tensorrt, dynamic shape is a suggested option, you can use '--shape_info_filename=shape.txt' for offline dynamic shape tuning"
+                    f"dynamic shape info file( {trt_shape_f} ) already exists, no need to generate again."
                 )
+            try:
+                config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f,
+                                                           True)
+            except Exception as E:
+                logger.info(E)
+                logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!")
         elif args.use_xpu:
             config.enable_xpu(10 * 1024 * 1024)