Commit eba1286c authored by qq_25193841

Merge remote-tracking branch 'origin/release/2.6' into release2.6

......@@ -27,13 +27,6 @@ PaddleOCR aims to build a rich, leading, and practical OCR toolkit that helps
## 📣 Recent updates
- 💼 **2022.9.5 PaddlePaddle smart finance industry live-course series**
- PaddleOCR releases four industry examples: curved seal text detection and recognition, key information extraction from scanned contracts, structured information extraction from general cards and certificates, and Chinese table recognition with attribute analysis
- Live streams every Tuesday and Thursday at 19:00 starting September 6; scan the QR code to join the WeChat group for free, get the live-stream links, and exchange ideas with industry experts
<div align="center">
<img src="https://user-images.githubusercontent.com/50011306/188440561-d99fce4f-f6ef-4ec0-be7e-47a70b91633a.jpg" width = "150" height = "150" />
</div>
- **🔥2022.8.24 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6) released**
- Released [PP-Structurev2](./ppstructure/README_ch.md) with comprehensive upgrades to system functionality and performance, adaptation to Chinese scenarios, new support for [layout recovery](./ppstructure/recovery/README_ch.md), and **one-command PDF-to-Word conversion**
......@@ -83,9 +76,10 @@ PaddleOCR aims to build a rich, leading, and practical OCR toolkit that helps
<a name="开源社区"></a>
## 👫 Open-source community
- **📑Project cooperation:** If you are an enterprise developer with a clear vertical OCR application need, fill in the [questionnaire](https://paddle.wjx.cn/vj/QwF7GKw.aspx) to start cooperation with the official team at various levels, free of charge.
- **👫Join the community:** Scan the QR code on WeChat and fill in the questionnaire to join the discussion group and claim benefits
- **Get replay links for the live-course series on the latest PaddleOCR releases, "Deep Dive into OCR Technology and Industrial Applications"**
- **10GB OCR learning gift pack:** the *Dive into OCR* e-book with companion explainer videos and notebook projects; 66 cutting-edge OCR papers from top conferences such as CVPR, AAAI, IJCAI, and ICCV; videos of past PaddleOCR release live courses; and project-sharing videos from outstanding OCR community developers.
- **👫Join the community:** Scan the QR code on WeChat and fill in the questionnaire to join the discussion group and claim the 20GB OCR learning gift pack
- **Including the *Dive into OCR* e-book**, companion explainer videos and notebook projects; videos of past PaddleOCR release live courses;
- **OCR scenario application model collection:** vertical models for digital tube displays, LCD screens, license plates, the high-accuracy SVTR model, handwriting recognition, and more, covering the main vertical OCR applications in the general, manufacturing, finance, and transportation industries.
- the PDF2Word application; project-sharing videos from outstanding OCR community developers.
- **🏅️Community projects**: the [community projects](./doc/doc_ch/thirdparty.md) document collects the **tools and applications community users have built with PaddleOCR** as well as the **features, documentation improvements, and code they have contributed back**; it serves as an honor wall for community developers and a broadcast station that helps publicize high-quality projects.
- **🎁Community regular competition**: a points-based competition for OCR developers covering four categories (documentation, code, models, and applications), with winners selected and rewarded quarterly; see the [link](https://github.com/PaddlePaddle/PaddleOCR/issues/4982) for competition details and registration.
......
......@@ -88,6 +88,7 @@ Train:
prob: 0.5
ext_data_num: 2
image_shape: [48, 320, 3]
max_text_length: *max_text_length
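# "*max_text_length" is a YAML alias: it reuses the value anchored as &max_text_length earlier in this config (under Global)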
- RecAug:
- MultiLabelEncode:
- RecResizeImg:
......
......@@ -162,6 +162,7 @@ Train:
prob: 0.5
ext_data_num: 2
image_shape: [48, 320, 3]
max_text_length: *max_text_length
- RecAug:
- MultiLabelEncode:
- RecResizeImg:
......
......@@ -88,6 +88,7 @@ Train:
prob: 0.5
ext_data_num: 2
image_shape: [48, 320, 3]
max_text_length: *max_text_length
- RecAug:
- MultiLabelEncode:
- RecResizeImg:
......
......@@ -54,6 +54,7 @@ DECLARE_string(table_model_dir);
DECLARE_int32(table_max_len);
DECLARE_int32(table_batch_num);
DECLARE_string(table_char_dict_path);
DECLARE_bool(merge_no_span_structure);
// forward related
DECLARE_bool(det);
DECLARE_bool(rec);
......
......@@ -54,15 +54,12 @@ private:
std::vector<double> &time_info_det,
std::vector<double> &time_info_rec,
std::vector<double> &time_info_cls);
std::string
rebuild_table(std::vector<std::string> rec_html_tags,
std::vector<std::vector<std::vector<int>>> rec_boxes,
std::string rebuild_table(std::vector<std::string> rec_html_tags,
std::vector<std::vector<int>> rec_boxes,
std::vector<OCRPredictResult> &ocr_result);
float iou(std::vector<std::vector<int>> &box1,
std::vector<std::vector<int>> &box2);
float dis(std::vector<std::vector<int>> &box1,
std::vector<std::vector<int>> &box2);
float iou(std::vector<int> &box1, std::vector<int> &box2);
float dis(std::vector<int> &box1, std::vector<int> &box2);
static bool comparison_dis(const std::vector<float> &dis1,
const std::vector<float> &dis2) {
......
......@@ -92,13 +92,12 @@ private:
class TablePostProcessor {
public:
void init(std::string label_path);
void
Run(std::vector<float> &loc_preds, std::vector<float> &structure_probs,
void init(std::string label_path, bool merge_no_span_structure = true);
void Run(std::vector<float> &loc_preds, std::vector<float> &structure_probs,
std::vector<float> &rec_scores, std::vector<int> &loc_preds_shape,
std::vector<int> &structure_probs_shape,
std::vector<std::vector<std::string>> &rec_html_tag_batch,
std::vector<std::vector<std::vector<std::vector<int>>>> &rec_boxes_batch,
std::vector<std::vector<std::vector<int>>> &rec_boxes_batch,
std::vector<int> &width_list, std::vector<int> &height_list);
private:
......
......@@ -44,7 +44,8 @@ public:
const int &gpu_mem, const int &cpu_math_library_num_threads,
const bool &use_mkldnn, const string &label_path,
const bool &use_tensorrt, const std::string &precision,
const int &table_batch_num, const int &table_max_len) {
const int &table_batch_num, const int &table_max_len,
const bool &merge_no_span_structure) {
this->use_gpu_ = use_gpu;
this->gpu_id_ = gpu_id;
this->gpu_mem_ = gpu_mem;
......@@ -55,7 +56,7 @@ public:
this->table_batch_num_ = table_batch_num;
this->table_max_len_ = table_max_len;
this->post_processor_.init(label_path);
this->post_processor_.init(label_path, merge_no_span_structure);
LoadModel(model_dir);
}
......@@ -65,7 +66,7 @@ public:
void Run(std::vector<cv::Mat> img_list,
std::vector<std::vector<std::string>> &rec_html_tags,
std::vector<float> &rec_scores,
std::vector<std::vector<std::vector<std::vector<int>>>> &rec_boxes,
std::vector<std::vector<std::vector<int>>> &rec_boxes,
std::vector<double> &times);
private:
......
......@@ -42,6 +42,7 @@ struct OCRPredictResult {
struct StructurePredictResult {
std::vector<int> box;
std::vector<std::vector<int>> cell_box;
std::string type;
std::vector<OCRPredictResult> text_res;
std::string html;
......@@ -56,6 +57,10 @@ public:
const std::vector<OCRPredictResult> &ocr_result,
const std::string &save_path);
static void VisualizeBboxes(const cv::Mat &srcimg,
const StructurePredictResult &structure_result,
const std::string &save_path);
template <class ForwardIterator>
inline static size_t argmax(ForwardIterator first, ForwardIterator last) {
return std::distance(first, std::max_element(first, last));
......@@ -81,6 +86,9 @@ public:
static void sorted_boxes(std::vector<OCRPredictResult> &ocr_result);
static std::vector<int> xyxyxyxy2xyxy(std::vector<std::vector<int>> &box);
static std::vector<int> xyxyxyxy2xyxy(std::vector<int> &box);
private:
static bool comparison_box(const OCRPredictResult &result1,
const OCRPredictResult &result2) {
......
......@@ -350,6 +350,7 @@ More parameters are as follows,
|table_model_dir|string|-|Address of table recognition inference model|
|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|dictionary file|
|table_max_len|int|488|Size of the long side of the table recognition model's input image; the final network input size is (table_max_len, table_max_len)|
|merge_no_span_structure|bool|true|Whether to merge `<td>` and `</td>` tokens into a single `<td></td>` token|
* Multi-language inference is also supported in PaddleOCR, you can refer to [recognition tutorial](../../doc/doc_en/recognition_en.md) for more supported languages and models in PaddleOCR. Specifically, if you want to infer using multi-language models, you just need to modify values of `rec_char_dict_path` and `rec_model_dir`.
......
......@@ -359,6 +359,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
|table_model_dir|string|-|Path of the table recognition inference model|
|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|dictionary file|
|table_max_len|int|488|Size of the long side of the table recognition model's input image; the final network input size is (table_max_len, table_max_len)|
|merge_no_span_structure|bool|true|Whether to merge `<td>` and `</td>` tokens into a single `<td></td>` token|
* PaddleOCR also supports multi-language prediction; see the multi-language dictionaries and models section of the [recognition documentation](../../doc/doc_ch/recognition.md) for more supported languages and models. To run multi-language prediction, simply modify the `rec_char_dict_path` (dictionary file path) and `rec_model_dir` (inference model path) fields.
......
......@@ -55,8 +55,10 @@ DEFINE_int32(rec_img_w, 320, "rec image width");
DEFINE_string(table_model_dir, "", "Path of table structure inference model.");
DEFINE_int32(table_max_len, 488, "max len size of input image.");
DEFINE_int32(table_batch_num, 1, "table_batch_num.");
DEFINE_bool(merge_no_span_structure, true,
"Whether merge <td> and </td> to <td></td>");
DEFINE_string(table_char_dict_path,
"../../ppocr/utils/dict/table_structure_dict.txt",
"../../ppocr/utils/dict/table_structure_dict_ch.txt",
"Path of dictionary.");
// ocr forward related
......
......@@ -120,6 +120,7 @@ void structure(std::vector<cv::String> &cv_all_img_names) {
engine.structure(cv_all_img_names, false, FLAGS_table);
for (int i = 0; i < cv_all_img_names.size(); i++) {
cout << "predict img: " << cv_all_img_names[i] << endl;
cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR);
for (int j = 0; j < structure_results[i].size(); j++) {
std::cout << j << "\ttype: " << structure_results[i][j].type
<< ", region: [";
......@@ -129,6 +130,11 @@ void structure(std::vector<cv::String> &cv_all_img_names) {
<< structure_results[i][j].box[3] << "], res: ";
if (structure_results[i][j].type == "table") {
std::cout << structure_results[i][j].html << std::endl;
std::string file_name = Utility::basename(cv_all_img_names[i]);
Utility::VisualizeBboxes(srcimg, structure_results[i][j],
FLAGS_output + "/" + std::to_string(j) + "_" +
file_name);
} else {
Utility::print_result(structure_results[i][j].text_res);
}
......
......@@ -112,6 +112,11 @@ void Classifier::LoadModel(const std::string &model_dir) {
precision = paddle_infer::Config::Precision::kInt8;
}
config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false);
if (!Utility::PathExists("./trt_cls_shape.txt")){
config.CollectShapeRangeInfo("./trt_cls_shape.txt");
} else {
config.EnableTunedTensorRtDynamicShape("./trt_cls_shape.txt", true);
}
}
} else {
config.DisableGpu();
......
......@@ -32,49 +32,12 @@ void DBDetector::LoadModel(const std::string &model_dir) {
if (this->precision_ == "int8") {
precision = paddle_infer::Config::Precision::kInt8;
}
config.EnableTensorRtEngine(1 << 20, 1, 20, precision, false, false);
std::map<std::string, std::vector<int>> min_input_shape = {
{"x", {1, 3, 50, 50}},
{"conv2d_92.tmp_0", {1, 120, 20, 20}},
{"conv2d_91.tmp_0", {1, 24, 10, 10}},
{"conv2d_59.tmp_0", {1, 96, 20, 20}},
{"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}},
{"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}},
{"conv2d_124.tmp_0", {1, 256, 20, 20}},
{"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}},
{"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}},
{"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}},
{"elementwise_add_7", {1, 56, 2, 2}},
{"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"x", {1, 3, 1536, 1536}},
{"conv2d_92.tmp_0", {1, 120, 400, 400}},
{"conv2d_91.tmp_0", {1, 24, 200, 200}},
{"conv2d_59.tmp_0", {1, 96, 400, 400}},
{"nearest_interp_v2_1.tmp_0", {1, 256, 200, 200}},
{"nearest_interp_v2_2.tmp_0", {1, 256, 400, 400}},
{"conv2d_124.tmp_0", {1, 256, 400, 400}},
{"nearest_interp_v2_3.tmp_0", {1, 64, 400, 400}},
{"nearest_interp_v2_4.tmp_0", {1, 64, 400, 400}},
{"nearest_interp_v2_5.tmp_0", {1, 64, 400, 400}},
{"elementwise_add_7", {1, 56, 400, 400}},
{"nearest_interp_v2_0.tmp_0", {1, 256, 400, 400}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"x", {1, 3, 640, 640}},
{"conv2d_92.tmp_0", {1, 120, 160, 160}},
{"conv2d_91.tmp_0", {1, 24, 80, 80}},
{"conv2d_59.tmp_0", {1, 96, 160, 160}},
{"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}},
{"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}},
{"conv2d_124.tmp_0", {1, 256, 160, 160}},
{"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}},
{"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}},
{"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}},
{"elementwise_add_7", {1, 56, 40, 40}},
{"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
config.EnableTensorRtEngine(1 << 30, 1, 20, precision, false, false);
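// On the first run, record the observed tensor shape ranges to trt_det_shape.txt; later runs reuse the tuned file, so per-tensor min/max/opt shapes no longer need to be hand-written.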
if (!Utility::PathExists("./trt_det_shape.txt")){
config.CollectShapeRangeInfo("./trt_det_shape.txt");
} else {
config.EnableTunedTensorRtDynamicShape("./trt_det_shape.txt", true);
}
}
} else {
config.DisableGpu();
......
......@@ -148,19 +148,12 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) {
precision = paddle_infer::Config::Precision::kInt8;
}
config.EnableTensorRtEngine(1 << 20, 10, 15, precision, false, false);
int imgH = this->rec_image_shape_[1];
int imgW = this->rec_image_shape_[2];
std::map<std::string, std::vector<int>> min_input_shape = {
{"x", {1, 3, imgH, 10}}, {"lstm_0.tmp_0", {10, 1, 96}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"x", {this->rec_batch_num_, 3, imgH, 2500}},
{"lstm_0.tmp_0", {1000, 1, 96}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"x", {this->rec_batch_num_, 3, imgH, imgW}},
{"lstm_0.tmp_0", {25, 1, 96}}};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
if (!Utility::PathExists("./trt_rec_shape.txt")){
config.CollectShapeRangeInfo("./trt_rec_shape.txt");
} else {
config.EnableTunedTensorRtDynamicShape("./trt_rec_shape.txt", true);
}
}
} else {
config.DisableGpu();
......
......@@ -27,7 +27,7 @@ PaddleStructure::PaddleStructure() {
FLAGS_table_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem,
FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_table_char_dict_path,
FLAGS_use_tensorrt, FLAGS_precision, FLAGS_table_batch_num,
FLAGS_table_max_len);
FLAGS_table_max_len, FLAGS_merge_no_span_structure);
}
};
......@@ -42,7 +42,7 @@ PaddleStructure::structure(std::vector<cv::String> cv_all_img_names,
std::vector<std::vector<StructurePredictResult>> structure_results;
if (!Utility::PathExists(FLAGS_output) && FLAGS_det) {
mkdir(FLAGS_output.c_str(), 0777);
Utility::CreateDir(FLAGS_output);
}
for (int i = 0; i < cv_all_img_names.size(); ++i) {
std::vector<StructurePredictResult> structure_result;
......@@ -84,7 +84,7 @@ void PaddleStructure::table(cv::Mat img,
// predict structure
std::vector<std::vector<std::string>> structure_html_tags;
std::vector<float> structure_scores(1, 0);
std::vector<std::vector<std::vector<std::vector<int>>>> structure_boxes;
std::vector<std::vector<std::vector<int>>> structure_boxes;
std::vector<double> structure_times;
std::vector<cv::Mat> img_list;
img_list.push_back(img);
......@@ -103,20 +103,15 @@ void PaddleStructure::table(cv::Mat img,
this->det(img_list[i], ocr_result, time_info_det);
// crop image
std::vector<cv::Mat> rec_img_list;
std::vector<int> ocr_box;
for (int j = 0; j < ocr_result.size(); j++) {
int x_collect[4] = {ocr_result[j].box[0][0], ocr_result[j].box[1][0],
ocr_result[j].box[2][0], ocr_result[j].box[3][0]};
int y_collect[4] = {ocr_result[j].box[0][1], ocr_result[j].box[1][1],
ocr_result[j].box[2][1], ocr_result[j].box[3][1]};
int left = int(*std::min_element(x_collect, x_collect + 4));
int right = int(*std::max_element(x_collect, x_collect + 4));
int top = int(*std::min_element(y_collect, y_collect + 4));
int bottom = int(*std::max_element(y_collect, y_collect + 4));
std::vector<int> box{max(0, left - expand_pixel),
max(0, top - expand_pixel),
min(img_list[i].cols, right + expand_pixel),
min(img_list[i].rows, bottom + expand_pixel)};
cv::Mat crop_img = Utility::crop_image(img_list[i], box);
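// Collapse the 4-point OCR box to an axis-aligned xyxy box, pad it by expand_pixel, and clamp it to the image bounds.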
ocr_box = Utility::xyxyxyxy2xyxy(ocr_result[j].box);
ocr_box[0] = max(0, ocr_box[0] - expand_pixel);
ocr_box[1] = max(0, ocr_box[1] - expand_pixel);
ocr_box[2] = min(img_list[i].cols, ocr_box[2] + expand_pixel);
ocr_box[3] = min(img_list[i].rows, ocr_box[3] + expand_pixel);
cv::Mat crop_img = Utility::crop_image(img_list[i], ocr_box);
rec_img_list.push_back(crop_img);
}
// rec
......@@ -125,38 +120,37 @@ void PaddleStructure::table(cv::Mat img,
html = this->rebuild_table(structure_html_tags[i], structure_boxes[i],
ocr_result);
structure_result.html = html;
structure_result.cell_box = structure_boxes[i];
structure_result.html_score = structure_scores[i];
}
};
std::string PaddleStructure::rebuild_table(
std::vector<std::string> structure_html_tags,
std::vector<std::vector<std::vector<int>>> structure_boxes,
std::string
PaddleStructure::rebuild_table(std::vector<std::string> structure_html_tags,
std::vector<std::vector<int>> structure_boxes,
std::vector<OCRPredictResult> &ocr_result) {
// match text in same cell
std::vector<std::vector<string>> matched(structure_boxes.size(),
std::vector<std::string>());
std::vector<int> ocr_box;
std::vector<int> structure_box;
for (int i = 0; i < ocr_result.size(); i++) {
ocr_box = Utility::xyxyxyxy2xyxy(ocr_result[i].box);
ocr_box[0] -= 1;
ocr_box[1] -= 1;
ocr_box[2] += 1;
ocr_box[3] += 1;
std::vector<std::vector<float>> dis_list(structure_boxes.size(),
std::vector<float>(3, 100000.0));
for (int j = 0; j < structure_boxes.size(); j++) {
int x_collect[4] = {ocr_result[i].box[0][0], ocr_result[i].box[1][0],
ocr_result[i].box[2][0], ocr_result[i].box[3][0]};
int y_collect[4] = {ocr_result[i].box[0][1], ocr_result[i].box[1][1],
ocr_result[i].box[2][1], ocr_result[i].box[3][1]};
int left = int(*std::min_element(x_collect, x_collect + 4));
int right = int(*std::max_element(x_collect, x_collect + 4));
int top = int(*std::min_element(y_collect, y_collect + 4));
int bottom = int(*std::max_element(y_collect, y_collect + 4));
std::vector<std::vector<int>> box(2, std::vector<int>(2, 0));
box[0][0] = left - 1;
box[0][1] = top - 1;
box[1][0] = right + 1;
box[1][1] = bottom + 1;
dis_list[j][0] = this->dis(box, structure_boxes[j]);
dis_list[j][1] = 1 - this->iou(box, structure_boxes[j]);
if (structure_boxes[j].size() == 8) {
structure_box = Utility::xyxyxyxy2xyxy(structure_boxes[j]);
} else {
structure_box = structure_boxes[j];
}
dis_list[j][0] = this->dis(ocr_box, structure_box);
dis_list[j][1] = 1 - this->iou(ocr_box, structure_box);
dis_list[j][2] = j;
}
// find min dis idx
......@@ -164,6 +158,7 @@ std::string PaddleStructure::rebuild_table(
PaddleStructure::comparison_dis);
matched[dis_list[0][2]].push_back(ocr_result[i].text);
}
// get pred html
std::string html_str = "";
int td_tag_idx = 0;
......@@ -221,19 +216,18 @@ std::string PaddleStructure::rebuild_table(
return html_str;
}
float PaddleStructure::iou(std::vector<std::vector<int>> &box1,
std::vector<std::vector<int>> &box2) {
int area1 = max(0, box1[1][0] - box1[0][0]) * max(0, box1[1][1] - box1[0][1]);
int area2 = max(0, box2[1][0] - box2[0][0]) * max(0, box2[1][1] - box2[0][1]);
float PaddleStructure::iou(std::vector<int> &box1, std::vector<int> &box2) {
int area1 = max(0, box1[2] - box1[0]) * max(0, box1[3] - box1[1]);
int area2 = max(0, box2[2] - box2[0]) * max(0, box2[3] - box2[1]);
// computing the sum_area
int sum_area = area1 + area2;
// find the each point of intersect rectangle
int x1 = max(box1[0][0], box2[0][0]);
int y1 = max(box1[0][1], box2[0][1]);
int x2 = min(box1[1][0], box2[1][0]);
int y2 = min(box1[1][1], box2[1][1]);
int x1 = max(box1[0], box2[0]);
int y1 = max(box1[1], box2[1]);
int x2 = min(box1[2], box2[2]);
int y2 = min(box1[3], box2[3]);
// judge if there is an intersect
if (y1 >= y2 || x1 >= x2) {
......@@ -244,17 +238,16 @@ float PaddleStructure::iou(std::vector<std::vector<int>> &box1,
}
}
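As a quick sanity check of the xyxy IoU above, here is a minimal standalone sketch (a hypothetical free function mirroring the member function, not part of this commit), with boxes given as {x1, y1, x2, y2}:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical standalone mirror of PaddleStructure::iou for xyxy boxes.
float iou_xyxy(const std::vector<int> &box1, const std::vector<int> &box2) {
  int area1 = std::max(0, box1[2] - box1[0]) * std::max(0, box1[3] - box1[1]);
  int area2 = std::max(0, box2[2] - box2[0]) * std::max(0, box2[3] - box2[1]);
  int x1 = std::max(box1[0], box2[0]);
  int y1 = std::max(box1[1], box2[1]);
  int x2 = std::min(box1[2], box2[2]);
  int y2 = std::min(box1[3], box2[3]);
  if (y1 >= y2 || x1 >= x2)
    return 0.0f; // no overlap
  float intersect = float(x2 - x1) * float(y2 - y1);
  return intersect / (area1 + area2 - intersect);
}

int main() {
  std::vector<int> a{0, 0, 10, 10}, b{5, 5, 15, 15};
  // intersection 5*5 = 25, union 100 + 100 - 25 = 175 -> IoU = 25/175 ≈ 0.143
  float v = iou_xyxy(a, b);
  assert(v > 0.14f && v < 0.15f);
  return 0;
}
```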
float PaddleStructure::dis(std::vector<std::vector<int>> &box1,
std::vector<std::vector<int>> &box2) {
int x1_1 = box1[0][0];
int y1_1 = box1[0][1];
int x2_1 = box1[1][0];
int y2_1 = box1[1][1];
float PaddleStructure::dis(std::vector<int> &box1, std::vector<int> &box2) {
int x1_1 = box1[0];
int y1_1 = box1[1];
int x2_1 = box1[2];
int y2_1 = box1[3];
int x1_2 = box2[0][0];
int y1_2 = box2[0][1];
int x2_2 = box2[1][0];
int y2_2 = box2[1][1];
int x1_2 = box2[0];
int y1_2 = box2[1];
int x2_2 = box2[2];
int y2_2 = box2[3];
float dis =
abs(x1_2 - x1_1) + abs(y1_2 - y1_1) + abs(x2_2 - x2_1) + abs(y2_2 - y2_1);
......
......@@ -352,8 +352,21 @@ std::vector<std::vector<std::vector<int>>> DBPostProcessor::FilterTagDetRes(
return root_points;
}
void TablePostProcessor::init(std::string label_path) {
void TablePostProcessor::init(std::string label_path,
bool merge_no_span_structure) {
this->label_list_ = Utility::ReadDict(label_path);
if (merge_no_span_structure) {
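// Append the merged "<td></td>" token, then drop every standalone "<td>" token from the label list.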
this->label_list_.push_back("<td></td>");
std::vector<std::string>::iterator it;
for (it = this->label_list_.begin(); it != this->label_list_.end();) {
if (*it == "<td>") {
it = this->label_list_.erase(it);
} else {
++it;
}
}
}
// add_special_char
this->label_list_.insert(this->label_list_.begin(), this->beg);
this->label_list_.push_back(this->end);
}
......@@ -363,12 +376,12 @@ void TablePostProcessor::Run(
std::vector<float> &rec_scores, std::vector<int> &loc_preds_shape,
std::vector<int> &structure_probs_shape,
std::vector<std::vector<std::string>> &rec_html_tag_batch,
std::vector<std::vector<std::vector<std::vector<int>>>> &rec_boxes_batch,
std::vector<std::vector<std::vector<int>>> &rec_boxes_batch,
std::vector<int> &width_list, std::vector<int> &height_list) {
for (int batch_idx = 0; batch_idx < structure_probs_shape[0]; batch_idx++) {
// image tags and boxs
std::vector<std::string> rec_html_tags;
std::vector<std::vector<std::vector<int>>> rec_boxes;
std::vector<std::vector<int>> rec_boxes;
float score = 0.f;
int count = 0;
......@@ -378,7 +391,7 @@ void TablePostProcessor::Run(
// step
for (int step_idx = 0; step_idx < structure_probs_shape[1]; step_idx++) {
std::string html_tag;
std::vector<std::vector<int>> rec_box;
std::vector<int> rec_box;
// html tag
int step_start_idx = (batch_idx * structure_probs_shape[1] + step_idx) *
structure_probs_shape[2];
......@@ -399,17 +412,19 @@ void TablePostProcessor::Run(
count += 1;
score += char_score;
rec_html_tags.push_back(html_tag);
// box
if (html_tag == "<td>" || html_tag == "<td" || html_tag == "<td></td>") {
for (int point_idx = 0; point_idx < loc_preds_shape[2];
point_idx += 2) {
std::vector<int> point(2, 0);
for (int point_idx = 0; point_idx < loc_preds_shape[2]; point_idx++) {
step_start_idx = (batch_idx * structure_probs_shape[1] + step_idx) *
loc_preds_shape[2] +
point_idx;
point[0] = int(loc_preds[step_start_idx] * width_list[batch_idx]);
point[1] =
int(loc_preds[step_start_idx + 1] * height_list[batch_idx]);
float point = loc_preds[step_start_idx];
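// Even indices of loc_preds hold x coordinates (scaled by image width); odd indices hold y (scaled by height).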
if (point_idx % 2 == 0) {
point = int(point * width_list[batch_idx]);
} else {
point = int(point * height_list[batch_idx]);
}
rec_box.push_back(point);
}
rec_boxes.push_back(rec_box);
......
......@@ -20,7 +20,7 @@ void StructureTableRecognizer::Run(
std::vector<cv::Mat> img_list,
std::vector<std::vector<std::string>> &structure_html_tags,
std::vector<float> &structure_scores,
std::vector<std::vector<std::vector<std::vector<int>>>> &structure_boxes,
std::vector<std::vector<std::vector<int>>> &structure_boxes,
std::vector<double> &times) {
std::chrono::duration<float> preprocess_diff =
std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
......@@ -89,8 +89,7 @@ void StructureTableRecognizer::Run(
auto postprocess_start = std::chrono::steady_clock::now();
std::vector<std::vector<std::string>> structure_html_tag_batch;
std::vector<float> structure_score_batch;
std::vector<std::vector<std::vector<std::vector<int>>>>
structure_boxes_batch;
std::vector<std::vector<std::vector<int>>> structure_boxes_batch;
this->post_processor_.Run(loc_preds, structure_probs, structure_score_batch,
predict_shape0, predict_shape1,
structure_html_tag_batch, structure_boxes_batch,
......
......@@ -65,6 +65,37 @@ void Utility::VisualizeBboxes(const cv::Mat &srcimg,
<< std::endl;
}
void Utility::VisualizeBboxes(const cv::Mat &srcimg,
const StructurePredictResult &structure_result,
const std::string &save_path) {
cv::Mat img_vis;
srcimg.copyTo(img_vis);
for (int n = 0; n < structure_result.cell_box.size(); n++) {
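// Cell boxes arrive either as 8-value polygons (xyxyxyxy) or as 4-value rectangles (xyxy).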
if (structure_result.cell_box[n].size() == 8) {
cv::Point rook_points[4];
for (int m = 0; m < structure_result.cell_box[n].size(); m += 2) {
rook_points[m / 2] =
cv::Point(int(structure_result.cell_box[n][m]),
int(structure_result.cell_box[n][m + 1]));
}
const cv::Point *ppt[1] = {rook_points};
int npt[] = {4};
cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
} else if (structure_result.cell_box[n].size() == 4) {
cv::Point rook_points[2];
rook_points[0] = cv::Point(int(structure_result.cell_box[n][0]),
int(structure_result.cell_box[n][1]));
rook_points[1] = cv::Point(int(structure_result.cell_box[n][2]),
int(structure_result.cell_box[n][3]));
cv::rectangle(img_vis, rook_points[0], rook_points[1], CV_RGB(0, 255, 0),
2, 8, 0);
}
}
cv::imwrite(save_path, img_vis);
std::cout << "The table visualized image saved in " + save_path << std::endl;
}
// list all files under a directory
void Utility::GetAllFiles(const char *dir_name,
std::vector<std::string> &all_inputs) {
......@@ -268,13 +299,46 @@ cv::Mat Utility::crop_image(cv::Mat &img, std::vector<int> &area) {
void Utility::sorted_boxes(std::vector<OCRPredictResult> &ocr_result) {
std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box);
if (ocr_result.size() > 0) {
for (int i = 0; i < ocr_result.size() - 1; i++) {
if (abs(ocr_result[i + 1].box[0][1] - ocr_result[i].box[0][1]) < 10 &&
(ocr_result[i + 1].box[0][0] < ocr_result[i].box[0][0])) {
for (int j = i; j > 0; j--) {
if (abs(ocr_result[j + 1].box[0][1] - ocr_result[j].box[0][1]) < 10 &&
(ocr_result[j + 1].box[0][0] < ocr_result[j].box[0][0])) {
std::swap(ocr_result[i], ocr_result[i + 1]);
}
}
}
}
}
std::vector<int> Utility::xyxyxyxy2xyxy(std::vector<std::vector<int>> &box) {
int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]};
int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]};
int left = int(*std::min_element(x_collect, x_collect + 4));
int right = int(*std::max_element(x_collect, x_collect + 4));
int top = int(*std::min_element(y_collect, y_collect + 4));
int bottom = int(*std::max_element(y_collect, y_collect + 4));
std::vector<int> box1(4, 0);
box1[0] = left;
box1[1] = top;
box1[2] = right;
box1[3] = bottom;
return box1;
}
std::vector<int> Utility::xyxyxyxy2xyxy(std::vector<int> &box) {
int x_collect[4] = {box[0], box[2], box[4], box[6]};
int y_collect[4] = {box[1], box[3], box[5], box[7]};
int left = int(*std::min_element(x_collect, x_collect + 4));
int right = int(*std::max_element(x_collect, x_collect + 4));
int top = int(*std::min_element(y_collect, y_collect + 4));
int bottom = int(*std::max_element(y_collect, y_collect + 4));
std::vector<int> box1(4, 0);
box1[0] = left;
box1[1] = top;
box1[2] = right;
box1[3] = bottom;
return box1;
}
} // namespace PaddleOCR
\ No newline at end of file
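For illustration, a minimal usage sketch of the 8-value overload above (hypothetical values; assumes the `include/utility.h` header from deploy/cpp_infer is on the include path):

```cpp
#include <cassert>
#include <vector>

#include <include/utility.h>

int main() {
  // Quad corners (2,1) (9,3) (8,10) (1,8), flattened as xyxyxyxy.
  std::vector<int> quad{2, 1, 9, 3, 8, 10, 1, 8};
  // Collapses to the axis-aligned bounding box {left, top, right, bottom}.
  std::vector<int> bbox = PaddleOCR::Utility::xyxyxyxy2xyxy(quad);
  assert((bbox == std::vector<int>{1, 1, 9, 10}));
  return 0;
}
```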
......@@ -5,4 +5,4 @@ det_db_unclip_ratio 1.6
det_db_use_dilate 0
det_use_polygon_score 1
use_direction_classify 1
rec_image_height 32
\ No newline at end of file
rec_image_height 48
\ No newline at end of file
......@@ -99,6 +99,8 @@ The following table also provides a series of models that can be deployed on mob
|Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch|
|---|---|---|---|---|---|---|
|PP-OCRv3|extra-lightweight Chinese OCR optimized model|16.2M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb)|v2.10|
|PP-OCRv3(slim)|extra-lightweight Chinese OCR optimized model|5.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb)|v2.10|
|PP-OCRv2|extra-lightweight Chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10|
|PP-OCRv2(slim)|extra-lightweight Chinese OCR optimized model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10|
......@@ -134,17 +136,16 @@ Introduction to paddle_lite_opt parameters:
The following uses PaddleOCR's ultra-lightweight Chinese model as an example to show how to use the compiled opt tool to convert an inference model into a Paddle-Lite optimized model.
```
# [Recommendation] Download the Chinese and English inference models of PP-OCRv2
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
# [Recommendation] Download the Chinese and English inference models of PP-OCRv3
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar
# Convert detection model
./opt --model_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
# Convert recognition model
./opt --model_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
# Convert angle classifier model
./opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
```
After the conversion succeeds, new files ending with `.nb` will appear in the inference model directory; these are the successfully converted model files.
......@@ -197,15 +198,15 @@ Some preparatory work is required first.
cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/
```
Prepare the test image, taking PaddleOCR/doc/imgs/11.jpg as an example, copy the image file to the demo/cxx/ocr/debug/ folder. Prepare the model files optimized by the lite opt tool, ch_det_mv3_db_opt.nb, ch_rec_mv3_crnn_opt.nb, and place them under the demo/cxx/ocr/debug/ folder.
Prepare the test image, taking PaddleOCR/doc/imgs/11.jpg as an example, and copy it to the demo/cxx/ocr/debug/ folder. Prepare the model files optimized by the lite opt tool, ch_PP-OCRv3_det_slim_opt.nb and ch_PP-OCRv3_rec_slim_opt.nb, and place them under the demo/cxx/ocr/debug/ folder.
The structure of the OCR demo is as follows after the above command is executed:
```
demo/cxx/ocr/
|-- debug/
| |--ch_PP-OCRv2_det_slim_opt.nb Detection model
| |--ch_PP-OCRv2_rec_slim_opt.nb Recognition model
| |--ch_PP-OCRv3_det_slim_opt.nb Detection model
| |--ch_PP-OCRv3_rec_slim_opt.nb Recognition model
| |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb Text direction classification model
| |--11.jpg Image for OCR
| |--ppocr_keys_v1.txt Dictionary file
......@@ -240,7 +241,7 @@ det_db_thresh 0.3 # Used to filter the binarized image of DB prediction,
det_db_box_thresh 0.5 # DB post-processing box-filtering threshold; if boxes are missed, it can be reduced as appropriate
det_db_unclip_ratio 1.6 # Indicates the compactness of the text box, the smaller the value, the closer the text box to the text
use_direction_classify 0 # Whether to use the direction classifier, 0 means not to use, 1 means to use
rec_image_height 32 # The height of the input image of the recognition model, the PP-OCRv3 model needs to be set to 48, and the PP-OCRv2 model needs to be set to 32
rec_image_height 48 # The height of the input image of the recognition model, the PP-OCRv3 model needs to be set to 48, and the PP-OCRv2 model needs to be set to 32
```
5. Run Model on phone
......@@ -260,14 +261,14 @@ After the above steps are completed, you can use adb to push the file to the pho
export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH
# The use of ocr_db_crnn is:
# ./ocr_db_crnn Mode Detection model file Orientation classifier model file Recognition model file Hardware Precision Threads Batchsize Test image path Dictionary file path
./ocr_db_crnn system ch_PP-OCRv2_det_slim_opt.nb ch_PP-OCRv2_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True
./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True
# precision can be INT8 for quantitative model or FP32 for normal model.
# Only using detection model
./ocr_db_crnn det ch_PP-OCRv2_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt
./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt
# Only using recognition model
./ocr_db_crnn rec ch_PP-OCRv2_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
```
If you modify the code, you need to recompile and push to the phone.
......
......@@ -97,6 +97,8 @@ Paddle-Lite provides a variety of strategies to automatically optimize the original model, including
|Model version|Description|Model size|Detection model|Text direction classifier|Recognition model|Paddle-Lite version|
|---|---|---|---|---|---|---|
|PP-OCRv3|distilled ultra-lightweight Chinese OCR mobile model|16.2M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb)|v2.10|
|PP-OCRv3(slim)|distilled ultra-lightweight Chinese OCR mobile model|5.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb)|v2.10|
|PP-OCRv2|distilled ultra-lightweight Chinese OCR mobile model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10|
|PP-OCRv2(slim)|distilled ultra-lightweight Chinese OCR mobile model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10|
......@@ -131,16 +133,16 @@ Introduction to paddle_lite_opt parameters:
The following uses PaddleOCR's ultra-lightweight Chinese model as an example to show how to use the compiled opt tool to convert an inference model into a Paddle-Lite optimized model.
```
# [Recommendation] Download the PP-OCRv2 Chinese and English inference models
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
# [Recommendation] Download the PP-OCRv3 Chinese and English inference models
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar
# Convert the detection model
./opt --model_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
# Convert the recognition model
./opt --model_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
# Convert the direction classifier model
./opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer
```
......@@ -194,15 +196,15 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls
```
Prepare the test image, e.g. `PaddleOCR/doc/imgs/11.jpg`, and copy it to the `demo/cxx/ocr/debug/` folder.
Prepare the model files optimized by the lite opt tool, e.g. `ch_PP-OCRv2_det_slim_opt.nb, ch_PP-OCRv2_rec_slim_opt.nb, ch_ppocr_mobile_v2.0_cls_slim_opt.nb`, and place them in the `demo/cxx/ocr/debug/` folder.
Prepare the model files optimized by the lite opt tool, e.g. `ch_PP-OCRv3_det_slim_opt.nb, ch_PP-OCRv3_rec_slim_opt.nb, ch_ppocr_mobile_v2.0_cls_slim_opt.nb`, and place them in the `demo/cxx/ocr/debug/` folder.
After execution, the ocr folder will contain the following files:
```
demo/cxx/ocr/
|-- debug/
| |--ch_PP-OCRv2_det_slim_opt.nb Optimized detection model file
| |--ch_PP-OCRv2_rec_slim_opt.nb Optimized recognition model file
| |--ch_PP-OCRv3_det_slim_opt.nb Optimized detection model file
| |--ch_PP-OCRv3_rec_slim_opt.nb Optimized recognition model file
| |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb Optimized text direction classifier model file
| |--11.jpg Image to test
| |--ppocr_keys_v1.txt Chinese dictionary file
......@@ -239,7 +241,7 @@ det_db_thresh 0.3 # Used to filter the binarized image of DB prediction, set to 0.
det_db_box_thresh 0.5 # Threshold for filtering boxes in detector post-processing; reduce it if boxes are missed
det_db_unclip_ratio 1.6 # Compactness of the text box; the smaller the value, the closer the box fits the text
use_direction_classify 0 # Whether to use the direction classifier: 0 = no, 1 = yes
rec_image_height 32 # Input image height of the recognition model; set to 48 for PP-OCRv3 and 32 for PP-OCRv2
rec_image_height 48 # Input image height of the recognition model; set to 48 for PP-OCRv3 and 32 for PP-OCRv2
```
5. Start debugging
......@@ -259,13 +261,13 @@ rec_image_height 32 # Input image height of the recognition model; the PP-OCRv3 model
export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH
# Usage of the ocr_db_crnn executable:
# ./ocr_db_crnn <mode> <detection model> <direction classifier model> <recognition model> <hardware> <precision> <threads> <batchsize> <test image path> <config path> <dictionary path> <use benchmark flag>
./ocr_db_crnn system ch_PP-OCRv2_det_slim_opt.nb ch_PP-OCRv2_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True
./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True
# Use only the text detection model:
./ocr_db_crnn det ch_PP-OCRv2_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt
./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt
# Use only the text recognition model:
./ocr_db_crnn rec ch_PP-OCRv2_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt
```
If you modify the code, you need to recompile and push it to the phone again.
......
......@@ -22,7 +22,7 @@
### 1. Install PaddleSlim
```bash
pip3 install paddleslim==2.2.2
pip3 install paddleslim==2.3.2
```
### 2. Prepare the trained model
......@@ -32,18 +32,7 @@ PaddleOCR provides a series of trained [models](../../../doc/doc_ch/models_list.
### 3. Quantization training
Quantization training includes offline and online quantization training. Online quantization training works better: it requires loading a pre-trained model, and once the quantization strategy is defined the model can be quantized.
The quantization training code is located in slim/quantization/quant.py. For example, to train a detection model, the training command is as follows:
```bash
python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model='your trained model' Global.save_model_dir=./output/quant_model
# e.g. download the provided trained model
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar
tar -xf ch_ppocr_mobile_v2.0_det_train.tar
python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model
```
Model distillation and model quantization can be used together; take the PPOCRv3 detection model as an example:
The quantization training code is located in slim/quantization/quant.py. For example, taking the PPOCRv3 detection model as an example, the training command is as follows:
```
# Download the detection pre-trained model:
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
......@@ -58,7 +47,7 @@ python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_
After obtaining the model saved by quantization training, we can export it as an inference model for prediction deployment:
```bash
python deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
```
### 5. Deploy the quantized model
......
......@@ -25,7 +25,7 @@ After training, if you want to further compress the model size and accelerate th
### 1. Install PaddleSlim
```bash
pip3 install paddleslim==2.2.2
pip3 install paddleslim==2.3.2
```
......@@ -39,18 +39,7 @@ Quantization training includes offline quantization training and online quantiza
Online quantization training is more effective. It is necessary to load the pre-trained model.
After the quantization strategy is defined, the model can be quantized.
The code for quantization training is located in `slim/quantization/quant.py`. For example, to train a detection model, the training instructions are as follows:
```bash
python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model='your trained model' Global.save_model_dir=./output/quant_model
# download provided model
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar
tar -xf ch_ppocr_mobile_v2.0_det_train.tar
python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model
```
Model distillation and model quantization can be used at the same time, taking the PPOCRv3 detection model as an example:
The code for quantization training is located in `slim/quantization/quant.py`. For example, the training command for the slim PPOCRv3 detection model is as follows:
```
# download provided model
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
......@@ -66,7 +55,7 @@ If you want to quantize the text recognition model, you can modify the configura
Once we have the model after quantization training and fine-tuning, we can export it as an inference model for prediction deployment:
```bash
python deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
```
### 5. Deploy
......
......@@ -151,17 +151,24 @@ def main():
arch_config = config["Architecture"]
arch_config = config["Architecture"]
if arch_config["algorithm"] == "SVTR" and arch_config["Head"][
"name"] != 'MultiHead':
input_shape = config["Eval"]["dataset"]["transforms"][-2][
'SVTRRecResizeImg']['image_shape']
else:
input_shape = None
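# SVTR (without a MultiHead head) exports with the fixed input shape read from the eval SVTRRecResizeImg transform; other architectures keep input_shape = None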
if arch_config["algorithm"] in ["Distillation", ]: # distillation model
archs = list(arch_config["Models"].values())
for idx, name in enumerate(model.model_name_list):
sub_model_save_path = os.path.join(save_path, name, "inference")
export_single_model(model.model_list[idx], archs[idx],
sub_model_save_path, logger, quanter)
sub_model_save_path, logger, input_shape,
quanter)
else:
save_path = os.path.join(save_path, "inference")
export_single_model(model, arch_config, save_path, logger, quanter)
export_single_model(model, arch_config, save_path, logger, input_shape,
quanter)
if __name__ == "__main__":
......
......@@ -31,7 +31,7 @@ The prediction result is saved as `./output/sdmgr_kie/predicts_kie.txt`, and the
The visualization results are shown in the figure below:
<div align="center">
<img src="./imgs/0.png" width="800">
<img src="../../ppstructure/docs/imgs/sdmgr_result.png" width="800">
</div>
## 2. Model Training
......
......@@ -28,7 +28,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example
| epoch_num | Maximum training epoch number | 500 | \ |
| log_smooth_window | Log queue length, the median value in the queue each time will be printed | 20 | \ |
| print_batch_step | Set print log interval | 10 | \ |
| save_model_dir | Set model save path | output/{算法名称} | \ |
| save_model_dir | Set model save path | output/{algorithm_name} | \ |
| save_epoch_step | Set model save interval | 3 | \ |
| eval_batch_step | Set the model evaluation interval | 2000 or [1000, 2000] | run evaluation every 2000 iterations, or run it every 2000 iterations starting after the 1000th iteration |
| cal_metric_during_train | Set whether to evaluate the metric during the training process. At this time, the metric of the model under the current batch is evaluated | true | \ |
......
......@@ -27,8 +27,7 @@ class CosineEmbeddingLoss(nn.Layer):
self.epsilon = 1e-12
def forward(self, x1, x2, target):
similarity = paddle.sum(
x1 * x2, dim=-1) / (paddle.norm(
similarity = paddle.sum(x1 * x2, axis=-1) / (paddle.norm(
x1, axis=-1) * paddle.norm(
x2, axis=-1) + self.epsilon)
one_list = paddle.full_like(target, fill_value=1)
......
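For reference, the quantity computed in the hunk above is the standard cosine similarity (the fix swaps Paddle's `dim` keyword for the correct `axis`), with $\epsilon = 10^{-12}$ guarding against division by zero:

$$
\mathrm{sim}(x_1, x_2) = \frac{\sum_i x_{1,i}\, x_{2,i}}{\lVert x_1 \rVert_2 \, \lVert x_2 \rVert_2 + \epsilon}
$$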
......@@ -32,7 +32,7 @@ def init_args():
parser.add_argument(
"--table_char_dict_path",
type=str,
default="../ppocr/utils/dict/table_structure_dict.txt")
default="../ppocr/utils/dict/table_structure_dict_ch.txt")
# params for layout
parser.add_argument("--layout_model_dir", type=str)
parser.add_argument(
......
......@@ -38,7 +38,6 @@ def init_args():
parser.add_argument("--ir_optim", type=str2bool, default=True)
parser.add_argument("--use_tensorrt", type=str2bool, default=False)
parser.add_argument("--min_subgraph_size", type=int, default=15)
parser.add_argument("--shape_info_filename", type=str, default=None)
parser.add_argument("--precision", type=str, default="fp32")
parser.add_argument("--gpu_mem", type=int, default=500)
......@@ -226,23 +225,22 @@ def create_predictor(args, mode, logger):
use_calib_mode=False)
# collect shape
if args.shape_info_filename is not None:
if not os.path.exists(args.shape_info_filename):
config.collect_shape_range_info(
args.shape_info_filename)
logger.info(
f"collect dynamic shape info into : {args.shape_info_filename}"
)
else:
trt_shape_f = os.path.join(model_dir, f"{mode}_trt_dynamic_shape.txt")
if not os.path.exists(trt_shape_f):
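# first run: collect the observed tensor shape ranges; the tuned file is reused in the else branch below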
config.collect_shape_range_info(trt_shape_f)
logger.info(
f"dynamic shape info file( {args.shape_info_filename} ) already exists, not need to generate again."
)
config.enable_tuned_tensorrt_dynamic_shape(
args.shape_info_filename, True)
f"collect dynamic shape info into : {trt_shape_f}")
else:
logger.info(
f"when using tensorrt, dynamic shape is a suggested option, you can use '--shape_info_filename=shape.txt' for offline dygnamic shape tuning"
f"dynamic shape info file( {trt_shape_f} ) already exists, not need to generate again."
)
try:
config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f,
True)
except Exception as E:
logger.info(E)
logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!")
elif args.use_xpu:
config.enable_xpu(10 * 1024 * 1024)
......