diff --git a/deploy/cpp_infer/include/args.h b/deploy/cpp_infer/include/args.h index f7fac9c92c421ca85818b2d04097ce8e55ea117e..e6e76ef927c16f6afe381f64ea8dde4ac99185cf 100644 --- a/deploy/cpp_infer/include/args.h +++ b/deploy/cpp_infer/include/args.h @@ -49,6 +49,11 @@ DECLARE_int32(rec_batch_num); DECLARE_string(rec_char_dict_path); DECLARE_int32(rec_img_h); DECLARE_int32(rec_img_w); +// layout model related +DECLARE_string(layout_model_dir); +DECLARE_string(layout_dict_path); +DECLARE_double(layout_score_threshold); +DECLARE_double(layout_nms_threshold); // structure model related DECLARE_string(table_model_dir); DECLARE_int32(table_max_len); @@ -59,4 +64,5 @@ DECLARE_bool(merge_no_span_structure); DECLARE_bool(det); DECLARE_bool(rec); DECLARE_bool(cls); -DECLARE_bool(table); \ No newline at end of file +DECLARE_bool(table); +DECLARE_bool(layout); \ No newline at end of file diff --git a/deploy/cpp_infer/include/ocr_cls.h b/deploy/cpp_infer/include/ocr_cls.h index f5429a7c5bc58c2640f042811ad0eed23f29feba..f5a0356573b3219865e0c9fe08d57358d3a2c88c 100644 --- a/deploy/cpp_infer/include/ocr_cls.h +++ b/deploy/cpp_infer/include/ocr_cls.h @@ -14,26 +14,12 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" #include "paddle_api.h" #include "paddle_inference_api.h" -#include -#include -#include -#include -#include - -#include -#include -#include #include #include -using namespace paddle_infer; - namespace PaddleOCR { class Classifier { @@ -66,7 +52,7 @@ public: std::vector &cls_scores, std::vector ×); private: - std::shared_ptr predictor_; + std::shared_ptr predictor_; bool use_gpu_ = false; int gpu_id_ = 0; diff --git a/deploy/cpp_infer/include/ocr_det.h b/deploy/cpp_infer/include/ocr_det.h index d1421b103b28b44e15a7df53a63fd893ca60e529..9f6f2520540f96dfa53f5c4c907317bb8ff04013 100644 --- a/deploy/cpp_infer/include/ocr_det.h +++ b/deploy/cpp_infer/include/ocr_det.h @@ -14,26 +14,12 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" #include "paddle_api.h" #include "paddle_inference_api.h" -#include -#include -#include -#include -#include - -#include -#include -#include #include #include -using namespace paddle_infer; - namespace PaddleOCR { class DBDetector { @@ -41,7 +27,7 @@ public: explicit DBDetector(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const string &limit_type, + const bool &use_mkldnn, const std::string &limit_type, const int &limit_side_len, const double &det_db_thresh, const double &det_db_box_thresh, const double &det_db_unclip_ratio, @@ -77,7 +63,7 @@ public: std::vector ×); private: - std::shared_ptr predictor_; + std::shared_ptr predictor_; bool use_gpu_ = false; int gpu_id_ = 0; @@ -85,7 +71,7 @@ private: int cpu_math_library_num_threads_ = 4; bool use_mkldnn_ = false; - string limit_type_ = "max"; + std::string limit_type_ = "max"; int limit_side_len_ = 960; double det_db_thresh_ = 0.3; diff --git a/deploy/cpp_infer/include/ocr_rec.h b/deploy/cpp_infer/include/ocr_rec.h index 30f8efa9996a62adc74717dd46f2aef7fc96b091..257c261033bf8f8c0ce605ba90cedfbb49d844dc 100644 --- a/deploy/cpp_infer/include/ocr_rec.h +++ b/deploy/cpp_infer/include/ocr_rec.h @@ -14,27 +14,12 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" #include "paddle_api.h" #include "paddle_inference_api.h" -#include -#include -#include -#include -#include - -#include -#include -#include #include -#include #include -using namespace paddle_infer; - namespace PaddleOCR { class CRNNRecognizer { @@ -42,7 +27,7 @@ public: explicit CRNNRecognizer(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const string &label_path, + const bool &use_mkldnn, const std::string &label_path, const bool &use_tensorrt, const std::string &precision, const int &rec_batch_num, const int &rec_img_h, @@ -75,7 +60,7 @@ public: std::vector &rec_text_scores, std::vector ×); private: - std::shared_ptr predictor_; + std::shared_ptr predictor_; bool use_gpu_ = false; int gpu_id_ = 0; diff --git a/deploy/cpp_infer/include/paddleocr.h b/deploy/cpp_infer/include/paddleocr.h index a2c60b14acceaa90a8d8e4a70ccc50f02f254eb6..16750a15f70d374f8aa837042ba6a13bc10a5d35 100644 --- a/deploy/cpp_infer/include/paddleocr.h +++ b/deploy/cpp_infer/include/paddleocr.h @@ -14,28 +14,9 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" -#include "paddle_api.h" -#include "paddle_inference_api.h" -#include -#include -#include -#include -#include - -#include -#include -#include - #include #include #include -#include -#include - -using namespace paddle_infer; namespace PaddleOCR { @@ -43,21 +24,27 @@ class PPOCR { public: explicit PPOCR(); ~PPOCR(); - std::vector> - ocr(std::vector cv_all_img_names, bool det = true, - bool rec = true, bool cls = true); + + std::vector> ocr(std::vector img_list, + bool det = true, + bool rec = true, + bool cls = true); + std::vector ocr(cv::Mat img, bool det = true, + bool rec = true, bool cls = true); + + void reset_timer(); + void benchmark_log(int img_num); protected: - void det(cv::Mat img, std::vector &ocr_results, - std::vector ×); + std::vector time_info_det = {0, 0, 0}; + std::vector time_info_rec = {0, 0, 0}; + std::vector time_info_cls = {0, 0, 0}; + + void det(cv::Mat img, std::vector &ocr_results); void rec(std::vector img_list, - std::vector &ocr_results, - std::vector ×); + std::vector &ocr_results); void cls(std::vector img_list, - std::vector &ocr_results, - std::vector ×); - void log(std::vector &det_times, std::vector &rec_times, - std::vector &cls_times, int img_num); + std::vector &ocr_results); private: DBDetector *detector_ = nullptr; diff --git a/deploy/cpp_infer/include/paddlestructure.h b/deploy/cpp_infer/include/paddlestructure.h index 6d2c8b7d203a05f531b8d038d885061c42897373..8478a85cdec23984f86a323f55a4591d52bcf08c 100644 --- a/deploy/cpp_infer/include/paddlestructure.h +++ b/deploy/cpp_infer/include/paddlestructure.h @@ -14,27 +14,9 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" -#include "paddle_api.h" -#include "paddle_inference_api.h" -#include -#include -#include -#include -#include - -#include -#include -#include - #include -#include +#include #include -#include - -using namespace paddle_infer; namespace PaddleOCR { @@ -42,23 +24,31 @@ class PaddleStructure : public PPOCR { public: explicit PaddleStructure(); ~PaddleStructure(); - std::vector> - structure(std::vector cv_all_img_names, bool layout = false, - bool table = true); + + std::vector structure(cv::Mat img, + bool layout = false, + bool table = true, + bool ocr = false); + + void reset_timer(); + void benchmark_log(int img_num); private: - StructureTableRecognizer *recognizer_ = nullptr; + std::vector time_info_table = {0, 0, 0}; + std::vector time_info_layout = {0, 0, 0}; + + StructureTableRecognizer *table_model_ = nullptr; + StructureLayoutRecognizer *layout_model_ = nullptr; + + void layout(cv::Mat img, + std::vector &structure_result); + + void table(cv::Mat img, StructurePredictResult &structure_result); - void table(cv::Mat img, StructurePredictResult &structure_result, - std::vector &time_info_table, - std::vector &time_info_det, - std::vector &time_info_rec, - std::vector &time_info_cls); std::string rebuild_table(std::vector rec_html_tags, std::vector> rec_boxes, std::vector &ocr_result); - float iou(std::vector &box1, std::vector &box2); float dis(std::vector &box1, std::vector &box2); static bool comparison_dis(const std::vector &dis1, diff --git a/deploy/cpp_infer/include/postprocess_op.h b/deploy/cpp_infer/include/postprocess_op.h index f5db52a6097f0fb916fc96fd8c76095f2ed1a9fa..e267eeee1dd8055b05bb10c89149ad31779aabc7 100644 --- a/deploy/cpp_infer/include/postprocess_op.h +++ b/deploy/cpp_infer/include/postprocess_op.h @@ -14,24 +14,9 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" -#include -#include -#include -#include -#include - -#include -#include -#include - #include "include/clipper.h" #include "include/utility.h" -using namespace std; - namespace PaddleOCR { class DBPostProcessor { @@ -106,4 +91,27 @@ private: std::string beg = "sos"; }; +class PicodetPostProcessor { +public: + void init(std::string label_path, const double score_threshold = 0.4, + const double nms_threshold = 0.5, + const std::vector &fpn_stride = {8, 16, 32, 64}); + void Run(std::vector &results, + std::vector> outs, std::vector ori_shape, + std::vector resize_shape, int eg_max); + std::vector fpn_stride_ = {8, 16, 32, 64}; + +private: + StructurePredictResult disPred2Bbox(std::vector bbox_pred, int label, + float score, int x, int y, int stride, + std::vector im_shape, int reg_max); + void nms(std::vector &input_boxes, + float nms_threshold); + + std::vector label_list_; + double score_threshold_ = 0.4; + double nms_threshold_ = 0.5; + int num_class_ = 5; +}; + } // namespace PaddleOCR diff --git a/deploy/cpp_infer/include/preprocess_op.h b/deploy/cpp_infer/include/preprocess_op.h index 078f19d5b808c81e88d7aa464d6bfaca7fe1b14e..0b2e18330cbb5d8455cc17a508ab1f12de0f389a 100644 --- a/deploy/cpp_infer/include/preprocess_op.h +++ b/deploy/cpp_infer/include/preprocess_op.h @@ -14,21 +14,12 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" -#include -#include #include -#include #include -#include -#include -#include - -using namespace std; -using namespace paddle; +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" namespace PaddleOCR { @@ -51,9 +42,9 @@ public: class ResizeImgType0 { public: - virtual void Run(const cv::Mat &img, cv::Mat &resize_img, string limit_type, - int limit_side_len, float &ratio_h, float &ratio_w, - bool use_tensorrt); + virtual void Run(const cv::Mat &img, cv::Mat &resize_img, + std::string limit_type, int limit_side_len, float &ratio_h, + float &ratio_w, bool use_tensorrt); }; class CrnnResizeImg { @@ -82,4 +73,10 @@ public: const int max_len = 488); }; +class Resize { +public: + virtual void Run(const cv::Mat &img, cv::Mat &resize_img, const int h, + const int w); +}; + } // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/cpp_infer/include/structure_layout.h b/deploy/cpp_infer/include/structure_layout.h new file mode 100644 index 0000000000000000000000000000000000000000..3dd605720fa1dc009e8f1b28768d221678df713e --- /dev/null +++ b/deploy/cpp_infer/include/structure_layout.h @@ -0,0 +1,78 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle_api.h" +#include "paddle_inference_api.h" + +#include +#include + +namespace PaddleOCR { + +class StructureLayoutRecognizer { +public: + explicit StructureLayoutRecognizer( + const std::string &model_dir, const bool &use_gpu, const int &gpu_id, + const int &gpu_mem, const int &cpu_math_library_num_threads, + const bool &use_mkldnn, const std::string &label_path, + const bool &use_tensorrt, const std::string &precision, + const double &layout_score_threshold, + const double &layout_nms_threshold) { + this->use_gpu_ = use_gpu; + this->gpu_id_ = gpu_id; + this->gpu_mem_ = gpu_mem; + this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; + this->use_mkldnn_ = use_mkldnn; + this->use_tensorrt_ = use_tensorrt; + this->precision_ = precision; + + this->post_processor_.init(label_path, layout_score_threshold, + layout_nms_threshold); + LoadModel(model_dir); + } + + // Load Paddle inference model + void LoadModel(const std::string &model_dir); + + void Run(cv::Mat img, std::vector &result, + std::vector ×); + +private: + std::shared_ptr predictor_; + + bool use_gpu_ = false; + int gpu_id_ = 0; + int gpu_mem_ = 4000; + int cpu_math_library_num_threads_ = 4; + bool use_mkldnn_ = false; + + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f}; + bool is_scale_ = true; + + bool use_tensorrt_ = false; + std::string precision_ = "fp32"; + + // pre-process + Resize resize_op_; + Normalize normalize_op_; + Permute permute_op_; + + // post-process + PicodetPostProcessor post_processor_; +}; + +} // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/cpp_infer/include/structure_table.h b/deploy/cpp_infer/include/structure_table.h index c09e65654a7c8a4deb6729ddfd876531020f306b..616e95d212c948ab165bc73da7758a263583eb98 100644 --- a/deploy/cpp_infer/include/structure_table.h +++ b/deploy/cpp_infer/include/structure_table.h @@ -14,26 +14,11 @@ #pragma once -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" #include "paddle_api.h" #include "paddle_inference_api.h" -#include -#include -#include -#include -#include - -#include -#include -#include #include #include -#include - -using namespace paddle_infer; namespace PaddleOCR { @@ -42,7 +27,7 @@ public: explicit StructureTableRecognizer( const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const string &label_path, + const bool &use_mkldnn, const std::string &label_path, const bool &use_tensorrt, const std::string &precision, const int &table_batch_num, const int &table_max_len, const bool &merge_no_span_structure) { @@ -70,7 +55,7 @@ public: std::vector ×); private: - std::shared_ptr predictor_; + std::shared_ptr predictor_; bool use_gpu_ = false; int gpu_id_ = 0; diff --git a/deploy/cpp_infer/include/utility.h b/deploy/cpp_infer/include/utility.h index 85b280fe25a46be70dba529891c3470a729dfbf1..7dfe03dd625e7b31bc64d875c893ea132b46423c 100644 --- a/deploy/cpp_infer/include/utility.h +++ b/deploy/cpp_infer/include/utility.h @@ -41,12 +41,13 @@ struct OCRPredictResult { }; struct StructurePredictResult { - std::vector box; + std::vector box; std::vector> cell_box; std::string type; std::vector text_res; std::string html; float html_score = -1; + float confidence; }; class Utility { @@ -82,13 +83,20 @@ public: static void print_result(const std::vector &ocr_result); - static cv::Mat crop_image(cv::Mat &img, std::vector &area); + static cv::Mat crop_image(cv::Mat &img, const std::vector &area); + static cv::Mat crop_image(cv::Mat &img, const std::vector &area); static void sorted_boxes(std::vector &ocr_result); static std::vector xyxyxyxy2xyxy(std::vector> &box); static std::vector xyxyxyxy2xyxy(std::vector &box); + static float fast_exp(float x); + static std::vector + activation_function_softmax(std::vector &src); + static float iou(std::vector &box1, std::vector &box2); + static float iou(std::vector &box1, std::vector &box2); + private: static bool comparison_box(const OCRPredictResult &result1, const OCRPredictResult &result2) { diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md index 2974f3227aa6f9cdd967665addc905f7b902bac2..d176ff986295088a15f4e20b16a7986c3640387b 100644 --- a/deploy/cpp_infer/readme.md +++ b/deploy/cpp_infer/readme.md @@ -174,6 +174,9 @@ inference/ |-- table | |--inference.pdiparams | |--inference.pdmodel +|-- layout +| |--inference.pdiparams +| |--inference.pdmodel ``` @@ -278,8 +281,30 @@ Specifically, --cls=true \ ``` +##### 7. layout+table +```shell +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --layout_model_dir=inference/layout \ + --type=structure \ + --table=true \ + --layout=true +``` + +##### 8. layout +```shell +./build/ppocr --layout_model_dir=inference/layout \ + --image_dir=../../ppstructure/docs/table/1.png \ + --type=structure \ + --table=false \ + --layout=true \ + --det=false \ + --rec=false +``` -##### 7. table +##### 9. table ```shell ./build/ppocr --det_model_dir=inference/det_db \ --rec_model_dir=inference/rec_rcnn \ @@ -343,6 +368,16 @@ More parameters are as follows, |rec_img_h|int|48|image height of recognition| |rec_img_w|int|320|image width of recognition| +- Layout related parameters + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|layout_model_dir|string|-| Address of layout inference model| +|layout_dict_path|string|../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt|dictionary file| +|layout_score_threshold|float|0.5|Threshold of score.| +|layout_nms_threshold|float|0.5|Threshold of nms.| + + - Table recognition related parameters |parameter|data type|default|meaning| @@ -368,11 +403,51 @@ predict img: ../../doc/imgs/12.jpg The detection visualized image saved in ./output//12.jpg ``` -- table +- layout+table ```bash -predict img: ../../ppstructure/docs/table/table.jpg -0 type: table, region: [0,0,371,293], res:
MethodsRPFFPS
SegLink [26]70.086.077.08.9
PixelLink [4]73.283.077.8-
TextSnake [18]73.983.278.31.1
TextField [37]75.987.481.35.2
MSR[38]76.787.481.7-
FTSN [3]77.187.682.0-
LSE[30]81.784.282.9-
CRAFT [2]78.288.282.98.6
MCN [16]798883-
ATRR[35]82.185.283.6-
PAN [34]83.884.484.130.2
DB[12]79.291.584.932.0
DRRG [41]82.3088.0585.08-
Ours (SynText)80.6885.4082.9712.68
Ours (MLT-17)84.5486.6285.5712.31
+predict img: ../../ppstructure/docs/table/1.png +0 type: text, region: [12,729,410,848], score: 0.781044, res: count of ocr result is : 7 +********** print ocr result ********** +0 det boxes: [[4,1],[79,1],[79,12],[4,12]] rec text: CTW1500. rec score: 0.769472 +... +6 det boxes: [[4,99],[391,99],[391,112],[4,112]] rec text: sate-of-the-artmethods[12.34.36l.ourapproachachieves rec score: 0.90414 +********** end print ocr result ********** +1 type: text, region: [69,342,342,359], score: 0.703666, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[269,2],[269,13],[8,13]] rec text: Table6.Experimentalresults on CTW-1500 rec score: 0.890454 +********** end print ocr result ********** +2 type: text, region: [70,316,706,332], score: 0.659738, res: count of ocr result is : 2 +********** print ocr result ********** +0 det boxes: [[373,2],[630,2],[630,11],[373,11]] rec text: oroposals.andthegreencontoursarefinal rec score: 0.919729 +1 det boxes: [[8,3],[357,3],[357,11],[8,11]] rec text: Visualexperimentalresultshebluecontoursareboundar rec score: 0.915963 +********** end print ocr result ********** +3 type: text, region: [489,342,789,359], score: 0.630538, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[294,2],[294,14],[8,14]] rec text: Table7.Experimentalresults onMSRA-TD500 rec score: 0.942251 +********** end print ocr result ********** +4 type: text, region: [444,751,841,848], score: 0.607345, res: count of ocr result is : 5 +********** print ocr result ********** +0 det boxes: [[19,3],[389,3],[389,17],[19,17]] rec text: Inthispaper,weproposeanovel adaptivebound rec score: 0.941031 +1 det boxes: [[4,22],[390,22],[390,36],[4,36]] rec text: aryproposalnetworkforarbitraryshapetextdetection rec score: 0.960172 +2 det boxes: [[4,42],[392,42],[392,56],[4,56]] rec text: whichadoptanboundaryproposalmodeltogeneratecoarse rec score: 0.934647 +3 det boxes: [[4,61],[389,61],[389,75],[4,75]] rec text: ooundaryproposals,andthenadoptanadaptiveboundary rec score: 0.946296 +4 det boxes: [[5,80],[387,80],[387,93],[5,93]] rec text: leformationmodelcombinedwithGCNandRNNtoper rec score: 0.952401 +********** end print ocr result ********** +5 type: title, region: [444,705,564,724], score: 0.785429, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[6,2],[113,2],[113,14],[6,14]] rec text: 5.Conclusion rec score: 0.856903 +********** end print ocr result ********** +6 type: table, region: [14,360,402,711], score: 0.963643, res:
MethodsExtRPFFPS
TextSnake [18]Syn85.367.975.6
CSE [17]MiLT76.178.777.40.38
LOMO[40]Syn76.585.780.84.4
ATRR[35]Sy-80.280.180.1-
SegLink++ [28]Syn79.882.881.3-
TextField [37]Syn79.883.081.46.0
MSR[38]Syn79.084.181.54.3
PSENet-1s [33]MLT79.784.882.23.9
DB [12]Syn80.286.983.422.0
CRAFT [2]Syn81.186.083.5-
TextDragon [5]MLT+82.884.583.6
PAN [34]Syn81.286.483.739.8
ContourNet [36]84.183.783.94.5
DRRG [41]MLT83.0285.9384.45-
TextPerception[23]Syn81.987.584.6
Ours Syn80.5787.6683.9712.08
Ours81.4587.8184.5112.15
OursMLT83.6086.4585.0012.21
+The table visualized image saved in ./output//6_1.png +7 type: table, region: [462,359,820,657], score: 0.953917, res:
MethodsRPFFPS
SegLink [26]70.086.077.08.9
PixelLink [4]73.283.077.8-
TextSnake [18]73.983.278.31.1
TextField [37]75.987.481.35.2
MSR[38]76.787.481.7-
FTSN[3]77.187.682.0:
LSE[30]81.784.282.9
CRAFT [2]78.288.282.98.6
MCN [16]798883-
ATRR[35]82.185.283.6-
PAN [34]83.884.484.130.2
DB[12]79.291.584.932.0
DRRG [41]82.3088.0585.08-
Ours (SynText)80.6885.4082.9712.68
Ours (MLT-17)84.5486.6285.5712.31
+The table visualized image saved in ./output//7_1.png +8 type: figure, region: [14,3,836,310], score: 0.969443, res: count of ocr result is : 26 +********** print ocr result ********** +0 det boxes: [[506,14],[539,15],[539,22],[506,21]] rec text: E rec score: 0.318073 +... +25 det boxes: [[680,290],[759,288],[759,303],[680,305]] rec text: (d) CTW1500 rec score: 0.95911 +********** end print ocr result ********** ``` diff --git a/deploy/cpp_infer/readme_ch.md b/deploy/cpp_infer/readme_ch.md index 03394efdc64788d924e155c989b1fac95f8432da..444567f193abade94029d0f048675eaf1cf03690 100644 --- a/deploy/cpp_infer/readme_ch.md +++ b/deploy/cpp_infer/readme_ch.md @@ -184,6 +184,9 @@ inference/ |-- table | |--inference.pdiparams | |--inference.pdmodel +|-- layout +| |--inference.pdiparams +| |--inference.pdmodel ``` @@ -288,7 +291,30 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir --cls=true \ ``` -##### 7. 表格识别 +##### 7. 版面分析+表格识别 +```shell +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --layout_model_dir=inference/layout \ + --type=structure \ + --table=true \ + --layout=true +``` + +##### 8. 版面分析 +```shell +./build/ppocr --layout_model_dir=inference/layout \ + --image_dir=../../ppstructure/docs/table/1.png \ + --type=structure \ + --table=false \ + --layout=true \ + --det=false \ + --rec=false +``` + +##### 9. 表格识别 ```shell ./build/ppocr --det_model_dir=inference/det_db \ --rec_model_dir=inference/rec_rcnn \ @@ -352,12 +378,22 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir |rec_img_w|int|320|文字识别模型输入图像宽度| +- 版面分析模型相关 + +|参数名称|类型|默认参数|意义| +| :---: | :---: | :---: | :---: | +|layout_model_dir|string|-|版面分析模型inference model地址| +|layout_dict_path|string|../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt|字典文件| +|layout_score_threshold|float|0.5|检测框的分数阈值| +|layout_nms_threshold|float|0.5|nms的阈值| + + - 表格识别模型相关 |参数名称|类型|默认参数|意义| | :---: | :---: | :---: | :---: | |table_model_dir|string|-|表格识别模型inference model地址| -|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|字典文件| +|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict_ch.txt|字典文件| |table_max_len|int|488|表格识别模型输入图像长边大小,最终网络输入图像大小为(table_max_len,table_max_len)| |merge_no_span_structure|bool|true|是否合并 和 为| @@ -378,11 +414,51 @@ predict img: ../../doc/imgs/12.jpg The detection visualized image saved in ./output//12.jpg ``` -- table +- layout+table ```bash -predict img: ../../ppstructure/docs/table/table.jpg -0 type: table, region: [0,0,371,293], res:
MethodsRPFFPS
SegLink [26]70.086.077.08.9
PixelLink [4]73.283.077.8-
TextSnake [18]73.983.278.31.1
TextField [37]75.987.481.35.2
MSR[38]76.787.481.7-
FTSN [3]77.187.682.0-
LSE[30]81.784.282.9-
CRAFT [2]78.288.282.98.6
MCN [16]798883-
ATRR[35]82.185.283.6-
PAN [34]83.884.484.130.2
DB[12]79.291.584.932.0
DRRG [41]82.3088.0585.08-
Ours (SynText)80.6885.4082.9712.68
Ours (MLT-17)84.5486.6285.5712.31
+predict img: ../../ppstructure/docs/table/1.png +0 type: text, region: [12,729,410,848], score: 0.781044, res: count of ocr result is : 7 +********** print ocr result ********** +0 det boxes: [[4,1],[79,1],[79,12],[4,12]] rec text: CTW1500. rec score: 0.769472 +... +6 det boxes: [[4,99],[391,99],[391,112],[4,112]] rec text: sate-of-the-artmethods[12.34.36l.ourapproachachieves rec score: 0.90414 +********** end print ocr result ********** +1 type: text, region: [69,342,342,359], score: 0.703666, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[269,2],[269,13],[8,13]] rec text: Table6.Experimentalresults on CTW-1500 rec score: 0.890454 +********** end print ocr result ********** +2 type: text, region: [70,316,706,332], score: 0.659738, res: count of ocr result is : 2 +********** print ocr result ********** +0 det boxes: [[373,2],[630,2],[630,11],[373,11]] rec text: oroposals.andthegreencontoursarefinal rec score: 0.919729 +1 det boxes: [[8,3],[357,3],[357,11],[8,11]] rec text: Visualexperimentalresultshebluecontoursareboundar rec score: 0.915963 +********** end print ocr result ********** +3 type: text, region: [489,342,789,359], score: 0.630538, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[294,2],[294,14],[8,14]] rec text: Table7.Experimentalresults onMSRA-TD500 rec score: 0.942251 +********** end print ocr result ********** +4 type: text, region: [444,751,841,848], score: 0.607345, res: count of ocr result is : 5 +********** print ocr result ********** +0 det boxes: [[19,3],[389,3],[389,17],[19,17]] rec text: Inthispaper,weproposeanovel adaptivebound rec score: 0.941031 +1 det boxes: [[4,22],[390,22],[390,36],[4,36]] rec text: aryproposalnetworkforarbitraryshapetextdetection rec score: 0.960172 +2 det boxes: [[4,42],[392,42],[392,56],[4,56]] rec text: whichadoptanboundaryproposalmodeltogeneratecoarse rec score: 0.934647 +3 det boxes: [[4,61],[389,61],[389,75],[4,75]] rec text: ooundaryproposals,andthenadoptanadaptiveboundary rec score: 0.946296 +4 det boxes: [[5,80],[387,80],[387,93],[5,93]] rec text: leformationmodelcombinedwithGCNandRNNtoper rec score: 0.952401 +********** end print ocr result ********** +5 type: title, region: [444,705,564,724], score: 0.785429, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[6,2],[113,2],[113,14],[6,14]] rec text: 5.Conclusion rec score: 0.856903 +********** end print ocr result ********** +6 type: table, region: [14,360,402,711], score: 0.963643, res:
MethodsExtRPFFPS
TextSnake [18]Syn85.367.975.6
CSE [17]MiLT76.178.777.40.38
LOMO[40]Syn76.585.780.84.4
ATRR[35]Sy-80.280.180.1-
SegLink++ [28]Syn79.882.881.3-
TextField [37]Syn79.883.081.46.0
MSR[38]Syn79.084.181.54.3
PSENet-1s [33]MLT79.784.882.23.9
DB [12]Syn80.286.983.422.0
CRAFT [2]Syn81.186.083.5-
TextDragon [5]MLT+82.884.583.6
PAN [34]Syn81.286.483.739.8
ContourNet [36]84.183.783.94.5
DRRG [41]MLT83.0285.9384.45-
TextPerception[23]Syn81.987.584.6
Ours Syn80.5787.6683.9712.08
Ours81.4587.8184.5112.15
OursMLT83.6086.4585.0012.21
+The table visualized image saved in ./output//6_1.png +7 type: table, region: [462,359,820,657], score: 0.953917, res:
MethodsRPFFPS
SegLink [26]70.086.077.08.9
PixelLink [4]73.283.077.8-
TextSnake [18]73.983.278.31.1
TextField [37]75.987.481.35.2
MSR[38]76.787.481.7-
FTSN[3]77.187.682.0:
LSE[30]81.784.282.9
CRAFT [2]78.288.282.98.6
MCN [16]798883-
ATRR[35]82.185.283.6-
PAN [34]83.884.484.130.2
DB[12]79.291.584.932.0
DRRG [41]82.3088.0585.08-
Ours (SynText)80.6885.4082.9712.68
Ours (MLT-17)84.5486.6285.5712.31
+The table visualized image saved in ./output//7_1.png +8 type: figure, region: [14,3,836,310], score: 0.969443, res: count of ocr result is : 26 +********** print ocr result ********** +0 det boxes: [[506,14],[539,15],[539,22],[506,21]] rec text: E rec score: 0.318073 +... +25 det boxes: [[680,290],[759,288],[759,303],[680,305]] rec text: (d) CTW1500 rec score: 0.95911 +********** end print ocr result ********** ``` diff --git a/deploy/cpp_infer/src/args.cpp b/deploy/cpp_infer/src/args.cpp index 17e9c8b625baf53c2583a6d778aba552cdd19e97..28066f0b20061059f32e2658fa4ea70fd827acb7 100644 --- a/deploy/cpp_infer/src/args.cpp +++ b/deploy/cpp_infer/src/args.cpp @@ -51,6 +51,13 @@ DEFINE_string(rec_char_dict_path, "../../ppocr/utils/ppocr_keys_v1.txt", DEFINE_int32(rec_img_h, 48, "rec image height"); DEFINE_int32(rec_img_w, 320, "rec image width"); +// layout model related +DEFINE_string(layout_model_dir, "", "Path of table layout inference model."); +DEFINE_string(layout_dict_path, + "../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt", + "Path of dictionary."); +DEFINE_double(layout_score_threshold, 0.5, "Threshold of score."); +DEFINE_double(layout_nms_threshold, 0.5, "Threshold of nms."); // structure model related DEFINE_string(table_model_dir, "", "Path of table struture inference model."); DEFINE_int32(table_max_len, 488, "max len size of input image."); @@ -65,4 +72,5 @@ DEFINE_string(table_char_dict_path, DEFINE_bool(det, true, "Whether use det in forward."); DEFINE_bool(rec, true, "Whether use rec in forward."); DEFINE_bool(cls, false, "Whether use cls in forward."); -DEFINE_bool(table, false, "Whether use table structure in forward."); \ No newline at end of file +DEFINE_bool(table, false, "Whether use table structure in forward."); +DEFINE_bool(layout, false, "Whether use layout analysis in forward."); \ No newline at end of file diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp index 34ffdc62674ef02b2d30c8e213a783495ceaff99..0c155dd0eca04874d23c3be7e6eff241b73f5f1b 100644 --- a/deploy/cpp_infer/src/main.cpp +++ b/deploy/cpp_infer/src/main.cpp @@ -65,9 +65,18 @@ void check_params() { exit(1); } } + if (FLAGS_layout) { + if (FLAGS_layout_model_dir.empty() || FLAGS_image_dir.empty()) { + std::cout << "Usage[layout]: ./ppocr " + << "--layout_model_dir=/PATH/TO/LAYOUT_INFERENCE_MODEL/ " + << "--image_dir=/PATH/TO/INPUT/IMAGE/" << std::endl; + exit(1); + } + } if (FLAGS_precision != "fp32" && FLAGS_precision != "fp16" && FLAGS_precision != "int8") { - cout << "precison should be 'fp32'(default), 'fp16' or 'int8'. " << endl; + std::cout << "precison should be 'fp32'(default), 'fp16' or 'int8'. " + << std::endl; exit(1); } } @@ -75,71 +84,94 @@ void check_params() { void ocr(std::vector &cv_all_img_names) { PPOCR ocr = PPOCR(); - std::vector> ocr_results = - ocr.ocr(cv_all_img_names, FLAGS_det, FLAGS_rec, FLAGS_cls); + if (FLAGS_benchmark) { + ocr.reset_timer(); + } + std::vector img_list; + std::vector img_names; for (int i = 0; i < cv_all_img_names.size(); ++i) { - if (FLAGS_benchmark) { - cout << cv_all_img_names[i] << '\t'; - if (FLAGS_rec && FLAGS_det) { - Utility::print_result(ocr_results[i]); - } else if (FLAGS_det) { - for (int n = 0; n < ocr_results[i].size(); n++) { - for (int m = 0; m < ocr_results[i][n].box.size(); m++) { - cout << ocr_results[i][n].box[m][0] << ' ' - << ocr_results[i][n].box[m][1] << ' '; - } - } - cout << endl; - } else { - Utility::print_result(ocr_results[i]); - } - } else { - cout << cv_all_img_names[i] << "\n"; - Utility::print_result(ocr_results[i]); - if (FLAGS_visualize && FLAGS_det) { - cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); - if (!srcimg.data) { - std::cerr << "[ERROR] image read failed! image path: " - << cv_all_img_names[i] << endl; - exit(1); - } - std::string file_name = Utility::basename(cv_all_img_names[i]); + cv::Mat img = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "[ERROR] image read failed! image path: " + << cv_all_img_names[i] << std::endl; + continue; + } + img_list.push_back(img); + img_names.push_back(cv_all_img_names[i]); + } - Utility::VisualizeBboxes(srcimg, ocr_results[i], - FLAGS_output + "/" + file_name); - } - cout << "***************************" << endl; + std::vector> ocr_results = + ocr.ocr(img_list, FLAGS_det, FLAGS_rec, FLAGS_cls); + + for (int i = 0; i < img_names.size(); ++i) { + std::cout << "predict img: " << cv_all_img_names[i] << std::endl; + Utility::print_result(ocr_results[i]); + if (FLAGS_visualize && FLAGS_det) { + std::string file_name = Utility::basename(img_names[i]); + cv::Mat srcimg = img_list[i]; + Utility::VisualizeBboxes(srcimg, ocr_results[i], + FLAGS_output + "/" + file_name); } } + if (FLAGS_benchmark) { + ocr.benchmark_log(cv_all_img_names.size()); + } } void structure(std::vector &cv_all_img_names) { PaddleOCR::PaddleStructure engine = PaddleOCR::PaddleStructure(); - std::vector> structure_results = - engine.structure(cv_all_img_names, false, FLAGS_table); + + if (FLAGS_benchmark) { + engine.reset_timer(); + } + for (int i = 0; i < cv_all_img_names.size(); i++) { - cout << "predict img: " << cv_all_img_names[i] << endl; - cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); - for (int j = 0; j < structure_results[i].size(); j++) { - std::cout << j << "\ttype: " << structure_results[i][j].type + std::cout << "predict img: " << cv_all_img_names[i] << std::endl; + cv::Mat img = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "[ERROR] image read failed! image path: " + << cv_all_img_names[i] << std::endl; + continue; + } + + std::vector structure_results = engine.structure( + img, FLAGS_layout, FLAGS_table, FLAGS_det && FLAGS_rec); + + for (int j = 0; j < structure_results.size(); j++) { + std::cout << j << "\ttype: " << structure_results[j].type << ", region: ["; - std::cout << structure_results[i][j].box[0] << "," - << structure_results[i][j].box[1] << "," - << structure_results[i][j].box[2] << "," - << structure_results[i][j].box[3] << "], res: "; - if (structure_results[i][j].type == "table") { - std::cout << structure_results[i][j].html << std::endl; - std::string file_name = Utility::basename(cv_all_img_names[i]); - - Utility::VisualizeBboxes(srcimg, structure_results[i][j], - FLAGS_output + "/" + std::to_string(j) + "_" + - file_name); + std::cout << structure_results[j].box[0] << "," + << structure_results[j].box[1] << "," + << structure_results[j].box[2] << "," + << structure_results[j].box[3] << "], score: "; + std::cout << structure_results[j].confidence << ", res: "; + + if (structure_results[j].type == "table") { + std::cout << structure_results[j].html << std::endl; + if (structure_results[j].cell_box.size() > 0 && FLAGS_visualize) { + std::string file_name = Utility::basename(cv_all_img_names[i]); + + Utility::VisualizeBboxes(img, structure_results[j], + FLAGS_output + "/" + std::to_string(j) + + "_" + file_name); + } } else { - Utility::print_result(structure_results[i][j].text_res); + std::cout << "count of ocr result is : " + << structure_results[j].text_res.size() << std::endl; + if (structure_results[j].text_res.size() > 0) { + std::cout << "********** print ocr result " + << "**********" << std::endl; + Utility::print_result(structure_results[j].text_res); + std::cout << "********** end print ocr result " + << "**********" << std::endl; + } } } } + if (FLAGS_benchmark) { + engine.benchmark_log(cv_all_img_names.size()); + } } int main(int argc, char **argv) { @@ -149,19 +181,22 @@ int main(int argc, char **argv) { if (!Utility::PathExists(FLAGS_image_dir)) { std::cerr << "[ERROR] image path not exist! image_dir: " << FLAGS_image_dir - << endl; + << std::endl; exit(1); } std::vector cv_all_img_names; cv::glob(FLAGS_image_dir, cv_all_img_names); - std::cout << "total images num: " << cv_all_img_names.size() << endl; + std::cout << "total images num: " << cv_all_img_names.size() << std::endl; + if (!Utility::PathExists(FLAGS_output)) { + Utility::CreateDir(FLAGS_output); + } if (FLAGS_type == "ocr") { ocr(cv_all_img_names); } else if (FLAGS_type == "structure") { structure(cv_all_img_names); } else { - std::cout << "only value in ['ocr','structure'] is supported" << endl; + std::cout << "only value in ['ocr','structure'] is supported" << std::endl; } } diff --git a/deploy/cpp_infer/src/ocr_cls.cpp b/deploy/cpp_infer/src/ocr_cls.cpp index 92d83600cea04419db231c0097caa53ed6fec58b..abcfed125f45253fc13c72f94621dda25ba12780 100644 --- a/deploy/cpp_infer/src/ocr_cls.cpp +++ b/deploy/cpp_infer/src/ocr_cls.cpp @@ -32,7 +32,7 @@ void Classifier::Run(std::vector img_list, for (int beg_img_no = 0; beg_img_no < img_num; beg_img_no += this->cls_batch_num_) { auto preprocess_start = std::chrono::steady_clock::now(); - int end_img_no = min(img_num, beg_img_no + this->cls_batch_num_); + int end_img_no = std::min(img_num, beg_img_no + this->cls_batch_num_); int batch_num = end_img_no - beg_img_no; // preprocess std::vector norm_img_batch; @@ -97,7 +97,7 @@ void Classifier::Run(std::vector img_list, } void Classifier::LoadModel(const std::string &model_dir) { - AnalysisConfig config; + paddle_infer::Config config; config.SetModel(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams"); @@ -112,9 +112,9 @@ void Classifier::LoadModel(const std::string &model_dir) { precision = paddle_infer::Config::Precision::kInt8; } config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false); - if (!Utility::PathExists("./trt_cls_shape.txt")){ + if (!Utility::PathExists("./trt_cls_shape.txt")) { config.CollectShapeRangeInfo("./trt_cls_shape.txt"); - } else { + } else { config.EnableTunedTensorRtDynamicShape("./trt_cls_shape.txt", true); } } @@ -136,6 +136,6 @@ void Classifier::LoadModel(const std::string &model_dir) { config.EnableMemoryOptim(); config.DisableGlogInfo(); - this->predictor_ = CreatePredictor(config); + this->predictor_ = paddle_infer::CreatePredictor(config); } } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/ocr_det.cpp b/deploy/cpp_infer/src/ocr_det.cpp index 030d5c2f359bba522662324d84c6ef1cc0bc83b8..74fa09bed1193a89091dca82569fa256d1773433 100644 --- a/deploy/cpp_infer/src/ocr_det.cpp +++ b/deploy/cpp_infer/src/ocr_det.cpp @@ -33,12 +33,11 @@ void DBDetector::LoadModel(const std::string &model_dir) { precision = paddle_infer::Config::Precision::kInt8; } config.EnableTensorRtEngine(1 << 30, 1, 20, precision, false, false); - if (!Utility::PathExists("./trt_det_shape.txt")){ + if (!Utility::PathExists("./trt_det_shape.txt")) { config.CollectShapeRangeInfo("./trt_det_shape.txt"); - } else { + } else { config.EnableTunedTensorRtDynamicShape("./trt_det_shape.txt", true); } - } } else { config.DisableGpu(); @@ -59,7 +58,7 @@ void DBDetector::LoadModel(const std::string &model_dir) { config.EnableMemoryOptim(); // config.DisableGlogInfo(); - this->predictor_ = CreatePredictor(config); + this->predictor_ = paddle_infer::CreatePredictor(config); } void DBDetector::Run(cv::Mat &img, diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp index 088cb942ba5ac4b09c9e8d1731a3b20d40967edf..96715163681092c0075fdbf456cc38b1679d82b9 100644 --- a/deploy/cpp_infer/src/ocr_rec.cpp +++ b/deploy/cpp_infer/src/ocr_rec.cpp @@ -37,7 +37,7 @@ void CRNNRecognizer::Run(std::vector img_list, for (int beg_img_no = 0; beg_img_no < img_num; beg_img_no += this->rec_batch_num_) { auto preprocess_start = std::chrono::steady_clock::now(); - int end_img_no = min(img_num, beg_img_no + this->rec_batch_num_); + int end_img_no = std::min(img_num, beg_img_no + this->rec_batch_num_); int batch_num = end_img_no - beg_img_no; int imgH = this->rec_image_shape_[1]; int imgW = this->rec_image_shape_[2]; @@ -46,7 +46,7 @@ void CRNNRecognizer::Run(std::vector img_list, int h = img_list[indices[ino]].rows; int w = img_list[indices[ino]].cols; float wh_ratio = w * 1.0 / h; - max_wh_ratio = max(max_wh_ratio, wh_ratio); + max_wh_ratio = std::max(max_wh_ratio, wh_ratio); } int batch_width = imgW; @@ -60,7 +60,7 @@ void CRNNRecognizer::Run(std::vector img_list, this->normalize_op_.Run(&resize_img, this->mean_, this->scale_, this->is_scale_); norm_img_batch.push_back(resize_img); - batch_width = max(resize_img.cols, batch_width); + batch_width = std::max(resize_img.cols, batch_width); } std::vector input(batch_num * 3 * imgH * batch_width, 0.0f); @@ -115,7 +115,7 @@ void CRNNRecognizer::Run(std::vector img_list, last_index = argmax_idx; } score /= count; - if (isnan(score)) { + if (std::isnan(score)) { continue; } rec_texts[indices[beg_img_no + m]] = str_res; @@ -130,7 +130,6 @@ void CRNNRecognizer::Run(std::vector img_list, } void CRNNRecognizer::LoadModel(const std::string &model_dir) { - // AnalysisConfig config; paddle_infer::Config config; config.SetModel(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams"); @@ -147,12 +146,11 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { if (this->precision_ == "int8") { precision = paddle_infer::Config::Precision::kInt8; } - if (!Utility::PathExists("./trt_rec_shape.txt")){ + if (!Utility::PathExists("./trt_rec_shape.txt")) { config.CollectShapeRangeInfo("./trt_rec_shape.txt"); - } else { + } else { config.EnableTunedTensorRtDynamicShape("./trt_rec_shape.txt", true); } - } } else { config.DisableGpu(); @@ -177,7 +175,7 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { config.EnableMemoryOptim(); // config.DisableGlogInfo(); - this->predictor_ = CreatePredictor(config); + this->predictor_ = paddle_infer::CreatePredictor(config); } } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/paddleocr.cpp b/deploy/cpp_infer/src/paddleocr.cpp index 1de4fc7e9af8bf63cf68ef42d2a508cdc4b5f9f3..86747c60d682c4f2df66a8bc8f5c9dae68b80170 100644 --- a/deploy/cpp_infer/src/paddleocr.cpp +++ b/deploy/cpp_infer/src/paddleocr.cpp @@ -16,7 +16,7 @@ #include #include "auto_log/autolog.h" -#include + namespace PaddleOCR { PPOCR::PPOCR() { @@ -44,8 +44,71 @@ PPOCR::PPOCR() { } }; -void PPOCR::det(cv::Mat img, std::vector &ocr_results, - std::vector ×) { +std::vector> +PPOCR::ocr(std::vector img_list, bool det, bool rec, bool cls) { + std::vector> ocr_results; + + if (!det) { + std::vector ocr_result; + ocr_result.resize(img_list.size()); + if (cls && this->classifier_ != nullptr) { + this->cls(img_list, ocr_result); + for (int i = 0; i < img_list.size(); i++) { + if (ocr_result[i].cls_label % 2 == 1 && + ocr_result[i].cls_score > this->classifier_->cls_thresh) { + cv::rotate(img_list[i], img_list[i], 1); + } + } + } + if (rec) { + this->rec(img_list, ocr_result); + } + for (int i = 0; i < ocr_result.size(); ++i) { + std::vector ocr_result_tmp; + ocr_result_tmp.push_back(ocr_result[i]); + ocr_results.push_back(ocr_result_tmp); + } + } else { + for (int i = 0; i < img_list.size(); ++i) { + std::vector ocr_result = + this->ocr(img_list[i], true, rec, cls); + ocr_results.push_back(ocr_result); + } + } + return ocr_results; +} + +std::vector PPOCR::ocr(cv::Mat img, bool det, bool rec, + bool cls) { + + std::vector ocr_result; + // det + this->det(img, ocr_result); + // crop image + std::vector img_list; + for (int j = 0; j < ocr_result.size(); j++) { + cv::Mat crop_img; + crop_img = Utility::GetRotateCropImage(img, ocr_result[j].box); + img_list.push_back(crop_img); + } + // cls + if (cls && this->classifier_ != nullptr) { + this->cls(img_list, ocr_result); + for (int i = 0; i < img_list.size(); i++) { + if (ocr_result[i].cls_label % 2 == 1 && + ocr_result[i].cls_score > this->classifier_->cls_thresh) { + cv::rotate(img_list[i], img_list[i], 1); + } + } + } + // rec + if (rec) { + this->rec(img_list, ocr_result); + } + return ocr_result; +} + +void PPOCR::det(cv::Mat img, std::vector &ocr_results) { std::vector>> boxes; std::vector det_times; @@ -58,14 +121,13 @@ void PPOCR::det(cv::Mat img, std::vector &ocr_results, } // sort boex from top to bottom, from left to right Utility::sorted_boxes(ocr_results); - times[0] += det_times[0]; - times[1] += det_times[1]; - times[2] += det_times[2]; + this->time_info_det[0] += det_times[0]; + this->time_info_det[1] += det_times[1]; + this->time_info_det[2] += det_times[2]; } void PPOCR::rec(std::vector img_list, - std::vector &ocr_results, - std::vector ×) { + std::vector &ocr_results) { std::vector rec_texts(img_list.size(), ""); std::vector rec_text_scores(img_list.size(), 0); std::vector rec_times; @@ -75,14 +137,13 @@ void PPOCR::rec(std::vector img_list, ocr_results[i].text = rec_texts[i]; ocr_results[i].score = rec_text_scores[i]; } - times[0] += rec_times[0]; - times[1] += rec_times[1]; - times[2] += rec_times[2]; + this->time_info_rec[0] += rec_times[0]; + this->time_info_rec[1] += rec_times[1]; + this->time_info_rec[2] += rec_times[2]; } void PPOCR::cls(std::vector img_list, - std::vector &ocr_results, - std::vector ×) { + std::vector &ocr_results) { std::vector cls_labels(img_list.size(), 0); std::vector cls_scores(img_list.size(), 0); std::vector cls_times; @@ -92,125 +153,43 @@ void PPOCR::cls(std::vector img_list, ocr_results[i].cls_label = cls_labels[i]; ocr_results[i].cls_score = cls_scores[i]; } - times[0] += cls_times[0]; - times[1] += cls_times[1]; - times[2] += cls_times[2]; + this->time_info_cls[0] += cls_times[0]; + this->time_info_cls[1] += cls_times[1]; + this->time_info_cls[2] += cls_times[2]; } -std::vector> -PPOCR::ocr(std::vector cv_all_img_names, bool det, bool rec, - bool cls) { - std::vector time_info_det = {0, 0, 0}; - std::vector time_info_rec = {0, 0, 0}; - std::vector time_info_cls = {0, 0, 0}; - std::vector> ocr_results; - - if (!det) { - std::vector ocr_result; - // read image - std::vector img_list; - for (int i = 0; i < cv_all_img_names.size(); ++i) { - cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); - if (!srcimg.data) { - std::cerr << "[ERROR] image read failed! image path: " - << cv_all_img_names[i] << endl; - exit(1); - } - img_list.push_back(srcimg); - OCRPredictResult res; - ocr_result.push_back(res); - } - if (cls && this->classifier_ != nullptr) { - this->cls(img_list, ocr_result, time_info_cls); - for (int i = 0; i < img_list.size(); i++) { - if (ocr_result[i].cls_label % 2 == 1 && - ocr_result[i].cls_score > this->classifier_->cls_thresh) { - cv::rotate(img_list[i], img_list[i], 1); - } - } - } - if (rec) { - this->rec(img_list, ocr_result, time_info_rec); - } - for (int i = 0; i < cv_all_img_names.size(); ++i) { - std::vector ocr_result_tmp; - ocr_result_tmp.push_back(ocr_result[i]); - ocr_results.push_back(ocr_result_tmp); - } - } else { - if (!Utility::PathExists(FLAGS_output) && FLAGS_det) { - Utility::CreateDir(FLAGS_output); - } - - for (int i = 0; i < cv_all_img_names.size(); ++i) { - std::vector ocr_result; - if (!FLAGS_benchmark) { - cout << "predict img: " << cv_all_img_names[i] << endl; - } - - cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); - if (!srcimg.data) { - std::cerr << "[ERROR] image read failed! image path: " - << cv_all_img_names[i] << endl; - exit(1); - } - // det - this->det(srcimg, ocr_result, time_info_det); - // crop image - std::vector img_list; - for (int j = 0; j < ocr_result.size(); j++) { - cv::Mat crop_img; - crop_img = Utility::GetRotateCropImage(srcimg, ocr_result[j].box); - img_list.push_back(crop_img); - } - - // cls - if (cls && this->classifier_ != nullptr) { - this->cls(img_list, ocr_result, time_info_cls); - for (int i = 0; i < img_list.size(); i++) { - if (ocr_result[i].cls_label % 2 == 1 && - ocr_result[i].cls_score > this->classifier_->cls_thresh) { - cv::rotate(img_list[i], img_list[i], 1); - } - } - } - // rec - if (rec) { - this->rec(img_list, ocr_result, time_info_rec); - } - ocr_results.push_back(ocr_result); - } - } - if (FLAGS_benchmark) { - this->log(time_info_det, time_info_rec, time_info_cls, - cv_all_img_names.size()); - } - return ocr_results; -} // namespace PaddleOCR +void PPOCR::reset_timer() { + this->time_info_det = {0, 0, 0}; + this->time_info_rec = {0, 0, 0}; + this->time_info_cls = {0, 0, 0}; +} -void PPOCR::log(std::vector &det_times, std::vector &rec_times, - std::vector &cls_times, int img_num) { - if (det_times[0] + det_times[1] + det_times[2] > 0) { +void PPOCR::benchmark_log(int img_num) { + if (this->time_info_det[0] + this->time_info_det[1] + this->time_info_det[2] > + 0) { AutoLogger autolog_det("ocr_det", FLAGS_use_gpu, FLAGS_use_tensorrt, FLAGS_enable_mkldnn, FLAGS_cpu_threads, 1, "dynamic", - FLAGS_precision, det_times, img_num); + FLAGS_precision, this->time_info_det, img_num); autolog_det.report(); } - if (rec_times[0] + rec_times[1] + rec_times[2] > 0) { + if (this->time_info_rec[0] + this->time_info_rec[1] + this->time_info_rec[2] > + 0) { AutoLogger autolog_rec("ocr_rec", FLAGS_use_gpu, FLAGS_use_tensorrt, FLAGS_enable_mkldnn, FLAGS_cpu_threads, FLAGS_rec_batch_num, "dynamic", FLAGS_precision, - rec_times, img_num); + this->time_info_rec, img_num); autolog_rec.report(); } - if (cls_times[0] + cls_times[1] + cls_times[2] > 0) { + if (this->time_info_cls[0] + this->time_info_cls[1] + this->time_info_cls[2] > + 0) { AutoLogger autolog_cls("ocr_cls", FLAGS_use_gpu, FLAGS_use_tensorrt, FLAGS_enable_mkldnn, FLAGS_cpu_threads, FLAGS_cls_batch_num, "dynamic", FLAGS_precision, - cls_times, img_num); + this->time_info_cls, img_num); autolog_cls.report(); } } + PPOCR::~PPOCR() { if (this->detector_ != nullptr) { delete this->detector_; diff --git a/deploy/cpp_infer/src/paddlestructure.cpp b/deploy/cpp_infer/src/paddlestructure.cpp index ea69977a1e45b0f7c1235a647d7c56db4d3cbc74..b2e35f8c777bde3cea0a3fefd0ce8517d8d75318 100644 --- a/deploy/cpp_infer/src/paddlestructure.cpp +++ b/deploy/cpp_infer/src/paddlestructure.cpp @@ -16,14 +16,19 @@ #include #include "auto_log/autolog.h" -#include -#include namespace PaddleOCR { PaddleStructure::PaddleStructure() { + if (FLAGS_layout) { + this->layout_model_ = new StructureLayoutRecognizer( + FLAGS_layout_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, + FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_layout_dict_path, + FLAGS_use_tensorrt, FLAGS_precision, FLAGS_layout_score_threshold, + FLAGS_layout_nms_threshold); + } if (FLAGS_table) { - this->recognizer_ = new StructureTableRecognizer( + this->table_model_ = new StructureTableRecognizer( FLAGS_table_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_table_char_dict_path, FLAGS_use_tensorrt, FLAGS_precision, FLAGS_table_batch_num, @@ -31,68 +36,63 @@ PaddleStructure::PaddleStructure() { } }; -std::vector> -PaddleStructure::structure(std::vector cv_all_img_names, - bool layout, bool table) { - std::vector time_info_det = {0, 0, 0}; - std::vector time_info_rec = {0, 0, 0}; - std::vector time_info_cls = {0, 0, 0}; - std::vector time_info_table = {0, 0, 0}; +std::vector +PaddleStructure::structure(cv::Mat srcimg, bool layout, bool table, bool ocr) { + cv::Mat img; + srcimg.copyTo(img); - std::vector> structure_results; + std::vector structure_results; - if (!Utility::PathExists(FLAGS_output) && FLAGS_det) { - Utility::CreateDir(FLAGS_output); + if (layout) { + this->layout(img, structure_results); + } else { + StructurePredictResult res; + res.type = "table"; + res.box = std::vector(4, 0.0); + res.box[2] = img.cols; + res.box[3] = img.rows; + structure_results.push_back(res); } - for (int i = 0; i < cv_all_img_names.size(); ++i) { - std::vector structure_result; - cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); - if (!srcimg.data) { - std::cerr << "[ERROR] image read failed! image path: " - << cv_all_img_names[i] << endl; - exit(1); - } - if (layout) { - } else { - StructurePredictResult res; - res.type = "table"; - res.box = std::vector(4, 0); - res.box[2] = srcimg.cols; - res.box[3] = srcimg.rows; - structure_result.push_back(res); - } - cv::Mat roi_img; - for (int i = 0; i < structure_result.size(); i++) { - // crop image - roi_img = Utility::crop_image(srcimg, structure_result[i].box); - if (structure_result[i].type == "table") { - this->table(roi_img, structure_result[i], time_info_table, - time_info_det, time_info_rec, time_info_cls); - } + cv::Mat roi_img; + for (int i = 0; i < structure_results.size(); i++) { + // crop image + roi_img = Utility::crop_image(img, structure_results[i].box); + if (structure_results[i].type == "table" && table) { + this->table(roi_img, structure_results[i]); + } else if (ocr) { + structure_results[i].text_res = this->ocr(roi_img, true, true, false); } - structure_results.push_back(structure_result); } + return structure_results; }; +void PaddleStructure::layout( + cv::Mat img, std::vector &structure_result) { + std::vector layout_times; + this->layout_model_->Run(img, structure_result, layout_times); + + this->time_info_layout[0] += layout_times[0]; + this->time_info_layout[1] += layout_times[1]; + this->time_info_layout[2] += layout_times[2]; +} + void PaddleStructure::table(cv::Mat img, - StructurePredictResult &structure_result, - std::vector &time_info_table, - std::vector &time_info_det, - std::vector &time_info_rec, - std::vector &time_info_cls) { + StructurePredictResult &structure_result) { // predict structure std::vector> structure_html_tags; std::vector structure_scores(1, 0); std::vector>> structure_boxes; - std::vector structure_imes; + std::vector structure_times; std::vector img_list; img_list.push_back(img); - this->recognizer_->Run(img_list, structure_html_tags, structure_scores, - structure_boxes, structure_imes); - time_info_table[0] += structure_imes[0]; - time_info_table[1] += structure_imes[1]; - time_info_table[2] += structure_imes[2]; + + this->table_model_->Run(img_list, structure_html_tags, structure_scores, + structure_boxes, structure_times); + + this->time_info_table[0] += structure_times[0]; + this->time_info_table[1] += structure_times[1]; + this->time_info_table[2] += structure_times[2]; std::vector ocr_result; std::string html; @@ -100,22 +100,22 @@ void PaddleStructure::table(cv::Mat img, for (int i = 0; i < img_list.size(); i++) { // det - this->det(img_list[i], ocr_result, time_info_det); + this->det(img_list[i], ocr_result); // crop image std::vector rec_img_list; std::vector ocr_box; for (int j = 0; j < ocr_result.size(); j++) { ocr_box = Utility::xyxyxyxy2xyxy(ocr_result[j].box); - ocr_box[0] = max(0, ocr_box[0] - expand_pixel); - ocr_box[1] = max(0, ocr_box[1] - expand_pixel), - ocr_box[2] = min(img_list[i].cols, ocr_box[2] + expand_pixel); - ocr_box[3] = min(img_list[i].rows, ocr_box[3] + expand_pixel); + ocr_box[0] = std::max(0, ocr_box[0] - expand_pixel); + ocr_box[1] = std::max(0, ocr_box[1] - expand_pixel), + ocr_box[2] = std::min(img_list[i].cols, ocr_box[2] + expand_pixel); + ocr_box[3] = std::min(img_list[i].rows, ocr_box[3] + expand_pixel); cv::Mat crop_img = Utility::crop_image(img_list[i], ocr_box); rec_img_list.push_back(crop_img); } // rec - this->rec(rec_img_list, ocr_result, time_info_rec); + this->rec(rec_img_list, ocr_result); // rebuild table html = this->rebuild_table(structure_html_tags[i], structure_boxes[i], ocr_result); @@ -130,8 +130,8 @@ PaddleStructure::rebuild_table(std::vector structure_html_tags, std::vector> structure_boxes, std::vector &ocr_result) { // match text in same cell - std::vector> matched(structure_boxes.size(), - std::vector()); + std::vector> matched(structure_boxes.size(), + std::vector()); std::vector ocr_box; std::vector structure_box; @@ -150,7 +150,7 @@ PaddleStructure::rebuild_table(std::vector structure_html_tags, structure_box = structure_boxes[j]; } dis_list[j][0] = this->dis(ocr_box, structure_box); - dis_list[j][1] = 1 - this->iou(ocr_box, structure_box); + dis_list[j][1] = 1 - Utility::iou(ocr_box, structure_box); dis_list[j][2] = j; } // find min dis idx @@ -216,28 +216,6 @@ PaddleStructure::rebuild_table(std::vector structure_html_tags, return html_str; } -float PaddleStructure::iou(std::vector &box1, std::vector &box2) { - int area1 = max(0, box1[2] - box1[0]) * max(0, box1[3] - box1[1]); - int area2 = max(0, box2[2] - box2[0]) * max(0, box2[3] - box2[1]); - - // computing the sum_area - int sum_area = area1 + area2; - - // find the each point of intersect rectangle - int x1 = max(box1[0], box2[0]); - int y1 = max(box1[1], box2[1]); - int x2 = min(box1[2], box2[2]); - int y2 = min(box1[3], box2[3]); - - // judge if there is an intersect - if (y1 >= y2 || x1 >= x2) { - return 0.0; - } else { - int intersect = (x2 - x1) * (y2 - y1); - return intersect / (sum_area - intersect + 0.00000001); - } -} - float PaddleStructure::dis(std::vector &box1, std::vector &box2) { int x1_1 = box1[0]; int y1_1 = box1[1]; @@ -253,12 +231,64 @@ float PaddleStructure::dis(std::vector &box1, std::vector &box2) { abs(x1_2 - x1_1) + abs(y1_2 - y1_1) + abs(x2_2 - x2_1) + abs(y2_2 - y2_1); float dis_2 = abs(x1_2 - x1_1) + abs(y1_2 - y1_1); float dis_3 = abs(x2_2 - x2_1) + abs(y2_2 - y2_1); - return dis + min(dis_2, dis_3); + return dis + std::min(dis_2, dis_3); +} + +void PaddleStructure::reset_timer() { + this->time_info_det = {0, 0, 0}; + this->time_info_rec = {0, 0, 0}; + this->time_info_cls = {0, 0, 0}; + this->time_info_table = {0, 0, 0}; + this->time_info_layout = {0, 0, 0}; +} + +void PaddleStructure::benchmark_log(int img_num) { + if (this->time_info_det[0] + this->time_info_det[1] + this->time_info_det[2] > + 0) { + AutoLogger autolog_det("ocr_det", FLAGS_use_gpu, FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, FLAGS_cpu_threads, 1, "dynamic", + FLAGS_precision, this->time_info_det, img_num); + autolog_det.report(); + } + if (this->time_info_rec[0] + this->time_info_rec[1] + this->time_info_rec[2] > + 0) { + AutoLogger autolog_rec("ocr_rec", FLAGS_use_gpu, FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, FLAGS_cpu_threads, + FLAGS_rec_batch_num, "dynamic", FLAGS_precision, + this->time_info_rec, img_num); + autolog_rec.report(); + } + if (this->time_info_cls[0] + this->time_info_cls[1] + this->time_info_cls[2] > + 0) { + AutoLogger autolog_cls("ocr_cls", FLAGS_use_gpu, FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, FLAGS_cpu_threads, + FLAGS_cls_batch_num, "dynamic", FLAGS_precision, + this->time_info_cls, img_num); + autolog_cls.report(); + } + if (this->time_info_table[0] + this->time_info_table[1] + + this->time_info_table[2] > + 0) { + AutoLogger autolog_table("table", FLAGS_use_gpu, FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, FLAGS_cpu_threads, + FLAGS_cls_batch_num, "dynamic", FLAGS_precision, + this->time_info_table, img_num); + autolog_table.report(); + } + if (this->time_info_layout[0] + this->time_info_layout[1] + + this->time_info_layout[2] > + 0) { + AutoLogger autolog_layout("layout", FLAGS_use_gpu, FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, FLAGS_cpu_threads, + FLAGS_cls_batch_num, "dynamic", FLAGS_precision, + this->time_info_layout, img_num); + autolog_layout.report(); + } } PaddleStructure::~PaddleStructure() { - if (this->recognizer_ != nullptr) { - delete this->recognizer_; + if (this->table_model_ != nullptr) { + delete this->table_model_; } }; diff --git a/deploy/cpp_infer/src/postprocess_op.cpp b/deploy/cpp_infer/src/postprocess_op.cpp index 4b0c693c80467bceb75da2b3fef6e816b0690979..c139fa7236856fa653b21bc7df5914290df0e21c 100644 --- a/deploy/cpp_infer/src/postprocess_op.cpp +++ b/deploy/cpp_infer/src/postprocess_op.cpp @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include namespace PaddleOCR { @@ -431,7 +430,7 @@ void TablePostProcessor::Run( } } score /= count; - if (isnan(score) || rec_boxes.size() == 0) { + if (std::isnan(score) || rec_boxes.size() == 0) { score = -1; } rec_scores.push_back(score); @@ -440,4 +439,137 @@ void TablePostProcessor::Run( } } +void PicodetPostProcessor::init(std::string label_path, + const double score_threshold, + const double nms_threshold, + const std::vector &fpn_stride) { + this->label_list_ = Utility::ReadDict(label_path); + this->score_threshold_ = score_threshold; + this->nms_threshold_ = nms_threshold; + this->num_class_ = label_list_.size(); + this->fpn_stride_ = fpn_stride; +} + +void PicodetPostProcessor::Run(std::vector &results, + std::vector> outs, + std::vector ori_shape, + std::vector resize_shape, int reg_max) { + int in_h = resize_shape[0]; + int in_w = resize_shape[1]; + float scale_factor_h = resize_shape[0] / float(ori_shape[0]); + float scale_factor_w = resize_shape[1] / float(ori_shape[1]); + + std::vector> bbox_results; + bbox_results.resize(this->num_class_); + for (int i = 0; i < this->fpn_stride_.size(); ++i) { + int feature_h = std::ceil((float)in_h / this->fpn_stride_[i]); + int feature_w = std::ceil((float)in_w / this->fpn_stride_[i]); + for (int idx = 0; idx < feature_h * feature_w; idx++) { + // score and label + float score = 0; + int cur_label = 0; + for (int label = 0; label < this->num_class_; label++) { + if (outs[i][idx * this->num_class_ + label] > score) { + score = outs[i][idx * this->num_class_ + label]; + cur_label = label; + } + } + // bbox + if (score > this->score_threshold_) { + int row = idx / feature_w; + int col = idx % feature_w; + std::vector bbox_pred( + outs[i + this->fpn_stride_.size()].begin() + idx * 4 * reg_max, + outs[i + this->fpn_stride_.size()].begin() + + (idx + 1) * 4 * reg_max); + bbox_results[cur_label].push_back( + this->disPred2Bbox(bbox_pred, cur_label, score, col, row, + this->fpn_stride_[i], resize_shape, reg_max)); + } + } + } + for (int i = 0; i < bbox_results.size(); i++) { + bool flag = bbox_results[i].size() <= 0; + } + for (int i = 0; i < bbox_results.size(); i++) { + bool flag = bbox_results[i].size() <= 0; + if (bbox_results[i].size() <= 0) { + continue; + } + this->nms(bbox_results[i], this->nms_threshold_); + for (auto box : bbox_results[i]) { + box.box[0] = box.box[0] / scale_factor_w; + box.box[2] = box.box[2] / scale_factor_w; + box.box[1] = box.box[1] / scale_factor_h; + box.box[3] = box.box[3] / scale_factor_h; + results.push_back(box); + } + } +} + +StructurePredictResult +PicodetPostProcessor::disPred2Bbox(std::vector bbox_pred, int label, + float score, int x, int y, int stride, + std::vector im_shape, int reg_max) { + float ct_x = (x + 0.5) * stride; + float ct_y = (y + 0.5) * stride; + std::vector dis_pred; + dis_pred.resize(4); + for (int i = 0; i < 4; i++) { + float dis = 0; + std::vector bbox_pred_i(bbox_pred.begin() + i * reg_max, + bbox_pred.begin() + (i + 1) * reg_max); + std::vector dis_after_sm = + Utility::activation_function_softmax(bbox_pred_i); + for (int j = 0; j < reg_max; j++) { + dis += j * dis_after_sm[j]; + } + dis *= stride; + dis_pred[i] = dis; + } + + float xmin = (std::max)(ct_x - dis_pred[0], .0f); + float ymin = (std::max)(ct_y - dis_pred[1], .0f); + float xmax = (std::min)(ct_x + dis_pred[2], (float)im_shape[1]); + float ymax = (std::min)(ct_y + dis_pred[3], (float)im_shape[0]); + + StructurePredictResult result_item; + result_item.box = {xmin, ymin, xmax, ymax}; + result_item.type = this->label_list_[label]; + result_item.confidence = score; + + return result_item; +} + +void PicodetPostProcessor::nms(std::vector &input_boxes, + float nms_threshold) { + std::sort(input_boxes.begin(), input_boxes.end(), + [](StructurePredictResult a, StructurePredictResult b) { + return a.confidence > b.confidence; + }); + std::vector picked(input_boxes.size(), 1); + + for (int i = 0; i < input_boxes.size(); ++i) { + if (picked[i] == 0) { + continue; + } + for (int j = i + 1; j < input_boxes.size(); ++j) { + if (picked[j] == 0) { + continue; + } + float iou = Utility::iou(input_boxes[i].box, input_boxes[j].box); + if (iou > nms_threshold) { + picked[j] = 0; + } + } + } + std::vector input_boxes_nms; + for (int i = 0; i < input_boxes.size(); ++i) { + if (picked[i] == 1) { + input_boxes_nms.push_back(input_boxes[i]); + } + } + input_boxes = input_boxes_nms; +} + } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/preprocess_op.cpp b/deploy/cpp_infer/src/preprocess_op.cpp index ac185e22d68955ef440e22c327b835dbce6c4e1b..19cd6c3f799e66c50a004881272e0c4a1e357c1d 100644 --- a/deploy/cpp_infer/src/preprocess_op.cpp +++ b/deploy/cpp_infer/src/preprocess_op.cpp @@ -12,21 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "opencv2/core.hpp" -#include "opencv2/imgcodecs.hpp" -#include "opencv2/imgproc.hpp" -#include "paddle_api.h" -#include "paddle_inference_api.h" -#include -#include -#include -#include -#include - -#include -#include -#include - #include namespace PaddleOCR { @@ -69,13 +54,13 @@ void Normalize::Run(cv::Mat *im, const std::vector &mean, } void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img, - string limit_type, int limit_side_len, float &ratio_h, - float &ratio_w, bool use_tensorrt) { + std::string limit_type, int limit_side_len, + float &ratio_h, float &ratio_w, bool use_tensorrt) { int w = img.cols; int h = img.rows; float ratio = 1.f; if (limit_type == "min") { - int min_wh = min(h, w); + int min_wh = std::min(h, w); if (min_wh < limit_side_len) { if (h < w) { ratio = float(limit_side_len) / float(h); @@ -84,7 +69,7 @@ void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img, } } } else { - int max_wh = max(h, w); + int max_wh = std::max(h, w); if (max_wh > limit_side_len) { if (h > w) { ratio = float(limit_side_len) / float(h); @@ -97,8 +82,8 @@ void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img, int resize_h = int(float(h) * ratio); int resize_w = int(float(w) * ratio); - resize_h = max(int(round(float(resize_h) / 32) * 32), 32); - resize_w = max(int(round(float(resize_w) / 32) * 32), 32); + resize_h = std::max(int(round(float(resize_h) / 32) * 32), 32); + resize_w = std::max(int(round(float(resize_w) / 32) * 32), 32); cv::resize(img, resize_img, cv::Size(resize_w, resize_h)); ratio_h = float(resize_h) / float(h); @@ -175,4 +160,9 @@ void TablePadImg::Run(const cv::Mat &img, cv::Mat &resize_img, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); } +void Resize::Run(const cv::Mat &img, cv::Mat &resize_img, const int h, + const int w) { + cv::resize(img, resize_img, cv::Size(w, h)); +} + } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/structure_layout.cpp b/deploy/cpp_infer/src/structure_layout.cpp new file mode 100644 index 0000000000000000000000000000000000000000..922959ae0238f01a0e9ce1bec41daba0a2c71669 --- /dev/null +++ b/deploy/cpp_infer/src/structure_layout.cpp @@ -0,0 +1,149 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace PaddleOCR { + +void StructureLayoutRecognizer::Run(cv::Mat img, + std::vector &result, + std::vector ×) { + std::chrono::duration preprocess_diff = + std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + std::chrono::duration inference_diff = + std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + std::chrono::duration postprocess_diff = + std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + + // preprocess + auto preprocess_start = std::chrono::steady_clock::now(); + + cv::Mat srcimg; + img.copyTo(srcimg); + cv::Mat resize_img; + this->resize_op_.Run(srcimg, resize_img, 800, 608); + this->normalize_op_.Run(&resize_img, this->mean_, this->scale_, + this->is_scale_); + + std::vector input(1 * 3 * resize_img.rows * resize_img.cols, 0.0f); + this->permute_op_.Run(&resize_img, input.data()); + auto preprocess_end = std::chrono::steady_clock::now(); + preprocess_diff += preprocess_end - preprocess_start; + + // inference. + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputHandle(input_names[0]); + input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); + auto inference_start = std::chrono::steady_clock::now(); + input_t->CopyFromCpu(input.data()); + + this->predictor_->Run(); + + // Get output tensor + std::vector> out_tensor_list; + std::vector> output_shape_list; + auto output_names = this->predictor_->GetOutputNames(); + for (int j = 0; j < output_names.size(); j++) { + auto output_tensor = this->predictor_->GetOutputHandle(output_names[j]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + output_shape_list.push_back(output_shape); + + std::vector out_data; + out_data.resize(out_num); + output_tensor->CopyToCpu(out_data.data()); + out_tensor_list.push_back(out_data); + } + auto inference_end = std::chrono::steady_clock::now(); + inference_diff += inference_end - inference_start; + + // postprocess + auto postprocess_start = std::chrono::steady_clock::now(); + + std::vector bbox_num; + int reg_max = 0; + for (int i = 0; i < out_tensor_list.size(); i++) { + if (i == this->post_processor_.fpn_stride_.size()) { + reg_max = output_shape_list[i][2] / 4; + break; + } + } + std::vector ori_shape = {srcimg.rows, srcimg.cols}; + std::vector resize_shape = {resize_img.rows, resize_img.cols}; + this->post_processor_.Run(result, out_tensor_list, ori_shape, resize_shape, + reg_max); + bbox_num.push_back(result.size()); + + auto postprocess_end = std::chrono::steady_clock::now(); + postprocess_diff += postprocess_end - postprocess_start; + times.push_back(double(preprocess_diff.count() * 1000)); + times.push_back(double(inference_diff.count() * 1000)); + times.push_back(double(postprocess_diff.count() * 1000)); +} + +void StructureLayoutRecognizer::LoadModel(const std::string &model_dir) { + paddle_infer::Config config; + if (Utility::PathExists(model_dir + "/inference.pdmodel") && + Utility::PathExists(model_dir + "/inference.pdiparams")) { + config.SetModel(model_dir + "/inference.pdmodel", + model_dir + "/inference.pdiparams"); + } else if (Utility::PathExists(model_dir + "/model.pdmodel") && + Utility::PathExists(model_dir + "/model.pdiparams")) { + config.SetModel(model_dir + "/model.pdmodel", + model_dir + "/model.pdiparams"); + } else { + std::cerr << "[ERROR] not find model.pdiparams or inference.pdiparams in " + << model_dir << std::endl; + exit(1); + } + + if (this->use_gpu_) { + config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); + if (this->use_tensorrt_) { + auto precision = paddle_infer::Config::Precision::kFloat32; + if (this->precision_ == "fp16") { + precision = paddle_infer::Config::Precision::kHalf; + } + if (this->precision_ == "int8") { + precision = paddle_infer::Config::Precision::kInt8; + } + config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false); + if (!Utility::PathExists("./trt_layout_shape.txt")) { + config.CollectShapeRangeInfo("./trt_layout_shape.txt"); + } else { + config.EnableTunedTensorRtDynamicShape("./trt_layout_shape.txt", true); + } + } + } else { + config.DisableGpu(); + if (this->use_mkldnn_) { + config.EnableMKLDNN(); + } + config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); + } + + // false for zero copy tensor + config.SwitchUseFeedFetchOps(false); + // true for multiple input + config.SwitchSpecifyInputNames(true); + + config.SwitchIrOptim(true); + + config.EnableMemoryOptim(); + config.DisableGlogInfo(); + + this->predictor_ = paddle_infer::CreatePredictor(config); +} +} // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/structure_table.cpp b/deploy/cpp_infer/src/structure_table.cpp index 7df0ab94b5df8a62148ceb01f48b35d73b14f78c..52f5d9ee9e46d88fd6e34bbb3afe86cbf7858140 100644 --- a/deploy/cpp_infer/src/structure_table.cpp +++ b/deploy/cpp_infer/src/structure_table.cpp @@ -34,7 +34,7 @@ void StructureTableRecognizer::Run( beg_img_no += this->table_batch_num_) { // preprocess auto preprocess_start = std::chrono::steady_clock::now(); - int end_img_no = min(img_num, beg_img_no + this->table_batch_num_); + int end_img_no = std::min(img_num, beg_img_no + this->table_batch_num_); int batch_num = end_img_no - beg_img_no; std::vector norm_img_batch; std::vector width_list; @@ -118,7 +118,7 @@ void StructureTableRecognizer::Run( } void StructureTableRecognizer::LoadModel(const std::string &model_dir) { - AnalysisConfig config; + paddle_infer::Config config; config.SetModel(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams"); @@ -133,6 +133,11 @@ void StructureTableRecognizer::LoadModel(const std::string &model_dir) { precision = paddle_infer::Config::Precision::kInt8; } config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false); + if (!Utility::PathExists("./trt_table_shape.txt")) { + config.CollectShapeRangeInfo("./trt_table_shape.txt"); + } else { + config.EnableTunedTensorRtDynamicShape("./trt_table_shape.txt", true); + } } } else { config.DisableGpu(); @@ -152,6 +157,6 @@ void StructureTableRecognizer::LoadModel(const std::string &model_dir) { config.EnableMemoryOptim(); config.DisableGlogInfo(); - this->predictor_ = CreatePredictor(config); + this->predictor_ = paddle_infer::CreatePredictor(config); } } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/utility.cpp b/deploy/cpp_infer/src/utility.cpp index 0e6ba17fc3bab5b5e005f8b5e41640899bee39d0..4a8b181494fca768b153e0825e8be0853f7f3aef 100644 --- a/deploy/cpp_infer/src/utility.cpp +++ b/deploy/cpp_infer/src/utility.cpp @@ -70,6 +70,7 @@ void Utility::VisualizeBboxes(const cv::Mat &srcimg, const std::string &save_path) { cv::Mat img_vis; srcimg.copyTo(img_vis); + img_vis = crop_image(img_vis, structure_result.box); for (int n = 0; n < structure_result.cell_box.size(); n++) { if (structure_result.cell_box[n].size() == 8) { cv::Point rook_points[4]; @@ -280,23 +281,29 @@ void Utility::print_result(const std::vector &ocr_result) { } } -cv::Mat Utility::crop_image(cv::Mat &img, std::vector &area) { +cv::Mat Utility::crop_image(cv::Mat &img, const std::vector &box) { cv::Mat crop_im; - int crop_x1 = std::max(0, area[0]); - int crop_y1 = std::max(0, area[1]); - int crop_x2 = std::min(img.cols - 1, area[2] - 1); - int crop_y2 = std::min(img.rows - 1, area[3] - 1); + int crop_x1 = std::max(0, box[0]); + int crop_y1 = std::max(0, box[1]); + int crop_x2 = std::min(img.cols - 1, box[2] - 1); + int crop_y2 = std::min(img.rows - 1, box[3] - 1); - crop_im = cv::Mat::zeros(area[3] - area[1], area[2] - area[0], 16); + crop_im = cv::Mat::zeros(box[3] - box[1], box[2] - box[0], 16); cv::Mat crop_im_window = - crop_im(cv::Range(crop_y1 - area[1], crop_y2 + 1 - area[1]), - cv::Range(crop_x1 - area[0], crop_x2 + 1 - area[0])); + crop_im(cv::Range(crop_y1 - box[1], crop_y2 + 1 - box[1]), + cv::Range(crop_x1 - box[0], crop_x2 + 1 - box[0])); cv::Mat roi_img = img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1)); crop_im_window += roi_img; return crop_im; } +cv::Mat Utility::crop_image(cv::Mat &img, const std::vector &box) { + std::vector box_int = {(int)box[0], (int)box[1], (int)box[2], + (int)box[3]}; + return crop_image(img, box_int); +} + void Utility::sorted_boxes(std::vector &ocr_result) { std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box); if (ocr_result.size() > 0) { @@ -341,4 +348,78 @@ std::vector Utility::xyxyxyxy2xyxy(std::vector &box) { return box1; } +float Utility::fast_exp(float x) { + union { + uint32_t i; + float f; + } v{}; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + return v.f; +} + +std::vector +Utility::activation_function_softmax(std::vector &src) { + int length = src.size(); + std::vector dst; + dst.resize(length); + const float alpha = float(*std::max_element(&src[0], &src[0 + length])); + float denominator{0}; + + for (int i = 0; i < length; ++i) { + dst[i] = fast_exp(src[i] - alpha); + denominator += dst[i]; + } + + for (int i = 0; i < length; ++i) { + dst[i] /= denominator; + } + return dst; +} + +float Utility::iou(std::vector &box1, std::vector &box2) { + int area1 = std::max(0, box1[2] - box1[0]) * std::max(0, box1[3] - box1[1]); + int area2 = std::max(0, box2[2] - box2[0]) * std::max(0, box2[3] - box2[1]); + + // computing the sum_area + int sum_area = area1 + area2; + + // find the each point of intersect rectangle + int x1 = std::max(box1[0], box2[0]); + int y1 = std::max(box1[1], box2[1]); + int x2 = std::min(box1[2], box2[2]); + int y2 = std::min(box1[3], box2[3]); + + // judge if there is an intersect + if (y1 >= y2 || x1 >= x2) { + return 0.0; + } else { + int intersect = (x2 - x1) * (y2 - y1); + return intersect / (sum_area - intersect + 0.00000001); + } +} + +float Utility::iou(std::vector &box1, std::vector &box2) { + float area1 = std::max((float)0.0, box1[2] - box1[0]) * + std::max((float)0.0, box1[3] - box1[1]); + float area2 = std::max((float)0.0, box2[2] - box2[0]) * + std::max((float)0.0, box2[3] - box2[1]); + + // computing the sum_area + float sum_area = area1 + area2; + + // find the each point of intersect rectangle + float x1 = std::max(box1[0], box2[0]); + float y1 = std::max(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::min(box1[3], box2[3]); + + // judge if there is an intersect + if (y1 >= y2 || x1 >= x2) { + return 0.0; + } else { + float intersect = (x2 - x1) * (y2 - y1); + return intersect / (sum_area - intersect + 0.00000001); + } +} + } // namespace PaddleOCR \ No newline at end of file