Merge pull request #6167 from an1018/add_recovery

add recovery

Merge pull request #6167 from an1018/add_recovery
add recovery
f499e800 · MissPenguin · GitHub · 4def1068 · 67e8dd1b · f499e800
8 changed file
--- a/applications/多模态表单识别.md
+++ b/applications/多模态表单识别.md
@@ -16,14 +16,14 @@
 <center><img src='https://ai-studio-static-online.cdn.bcebos.com/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e'></center>
 <center>图1 多模态表单识别流程图</center>
-注：欢迎再AIStudio领取免费算力体验线上实训，项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3815918)(配备Tesla V100、A100等高级算力资源)
+注：欢迎在AIStudio领取免费算力体验线上实训，项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3884375)(配备Tesla V100、A100等高级算力资源)
 # 2 安装说明
-下载PaddleOCR源码，本项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件)，无需下载解压即可，只需安装依赖环境~
+下载PaddleOCR源码。上述AIStudio项目中已经为大家打包好了PaddleOCR(已经修改好配置文件)，无需下载，解压即可，只需安装依赖环境~
 ```python
@@ -33,7 +33,7 @@
 ```python
 # 如仍需安装or安装更新，可以执行以下步骤
-! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
+# ! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
 # ! git clone https://gitee.com/PaddlePaddle/PaddleOCR
 ```
@@ -290,7 +290,7 @@ Eval.dataset.transforms.DetResizeForTest：评估尺寸，添加如下参数
 <center><img src="https://ai-studio-static-online.cdn.bcebos.com/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77"></center>
 <center>图8 文本检测方案2-模型评估</center>
-使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`
+使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`，[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/ch_db_mv3-student1600-finetune.zip)
 ```python
@@ -538,7 +538,7 @@ Train.dataset.ratio_list：动态采样
 <center>图16 文本识别方案3-模型评估</center>
-使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`
+使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`，[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/rec_mobile_pp-OCRv2-student-realdata.zip)
 ```python

--- a/ppstructure/docs/table/recovery.jpg
+++ b/ppstructure/docs/table/recovery.jpg
--- a/ppstructure/predict_system.py
+++ b/ppstructure/predict_system.py
@@ -23,6 +23,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 import cv2
 import json
+import numpy as np
 import time
 import logging
 from copy import deepcopy
@@ -33,6 +34,7 @@ from ppocr.utils.logging import get_logger
 from tools.infer.predict_system import TextSystem
 from ppstructure.table.predict_table import TableSystem, to_excel
 from ppstructure.utility import parse_args, draw_structure_result
+from ppstructure.recovery.docx import convert_info_docx
 logger = get_logger()
@@ -104,7 +106,12 @@ class StructureSystem(object):
                                                return_ocr_result_in_table)
                else:
                    if self.text_system is not None:
-                        filter_boxes, filter_rec_res = self.text_system(roi_img)
+                        if args.recovery:
+                            wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype)
+                            wht_im[y1:y2, x1:x2, :] = roi_img
+                            filter_boxes, filter_rec_res = self.text_system(wht_im)
+                        else:
+                            filter_boxes, filter_rec_res = self.text_system(roi_img)
                        # remove style char
                        style_token = [
                            '<strike>', '<strike>', '<sup>', '</sub>', '<b>',
@@ -118,7 +125,8 @@ class StructureSystem(object):
                            for token in style_token:
                                if token in rec_str:
                                    rec_str = rec_str.replace(token, '')
-                            box += [x1, y1]
+                            if not args.recovery:
+                                box += [x1, y1]
                            res.append({
                                'text': rec_str,
                                'confidence': float(rec_conf),
@@ -192,6 +200,8 @@ def main(args):
            # img_save_path = os.path.join(save_folder, img_name + '.jpg')
        cv2.imwrite(img_save_path, draw_img)
        logger.info('result save to {}'.format(img_save_path))
+        if args.recovery:
+            convert_info_docx(img, res, save_folder, img_name) 
        elapse = time.time() - starttime
        logger.info("Predict time : {:.3f}s".format(elapse))

--- a/ppstructure/recovery/README.md
+++ b/ppstructure/recovery/README.md
+English | [简体中文](README_ch.md)
+- [Getting Started](#getting-started)
+  - [1.  Introduction](#1)
+  - [2. Install](#2)
+    - [2.1 Install dependencies](#2.1)
+    - [2.2 Install PaddleOCR](#2.2)
+  - [3. Quick Start](#3)
+<a name="1"></a>
+## 1.  Introduction
+Layout recovery means that, after OCR recognition, the content keeps the arrangement of the original document image, and the paragraphs are written to a Word document in the same order.
+Layout recovery combines [layout analysis](../layout/README.md) and [table recognition](../table/README.md) to better recover images, tables, titles, etc.
+The following figure shows the result:
+<div align="center">
+<img src="../docs/table/recovery.jpg"  width = "700" />
+</div>
+<a name="2"></a>
+## 2. Install
+<a name="2.1"></a>
+### 2.1 Install dependencies
+- **(1) Install PaddlePaddle**
+```bash
+python3 -m pip install --upgrade pip
+# GPU installation
+python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple
+# CPU installation
+python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple
+````
+For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/install/quick).
+<a name="2.2"></a>
+### 2.2 Install PaddleOCR
+- **(1) Download source code**
+```bash
+# [Recommended]
+git clone https://github.com/PaddlePaddle/PaddleOCR
+# If cloning fails because of network problems, you can use the mirror hosted on Gitee instead:
+git clone https://gitee.com/paddlepaddle/PaddleOCR
+# Note: the Gitee mirror may lag behind this GitHub project by 3 to 5 days, so please prefer the recommended method.
+````
+- **(2) Install recovery's `requirements`**
+```bash
+python3 -m pip install -r ppstructure/recovery/requirements.txt
+````
+<a name="3"></a>
+## 3. Quick Start
+```bash
+cd PaddleOCR/ppstructure
+# download model
+mkdir inference && cd inference
+# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
+# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
+# Download the ultra-lightweight English table structure model and unzip it
+wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
+cd ..
+# run
+python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
+```
+After running, the docx file of each image will be saved in the directory specified by the `output` argument.
\ No newline at end of file
--- a/ppstructure/recovery/README_ch.md
+++ b/ppstructure/recovery/README_ch.md
+[English](README.md) | 简体中文
+# 版面恢复使用说明
+- [1. 简介](#1)
+- [2. 安装](#2)
+  - [2.1 安装依赖](#2.1)
+  - [2.2 安装PaddleOCR](#2.2)
+- [3. 使用](#3)
+<a name="1"></a>
+## 1.  简介
+版面恢复就是在OCR识别后，内容仍然像原文档图片那样排列，段落不变、顺序不变地输出到word文档中。
+版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术，从而更好地恢复图片、表格、标题等内容，下图展示了版面恢复的结果：
+<div align="center">
+<img src="../docs/table/recovery.jpg"  width = "700" />
+</div>
+<a name="2"></a>
+## 2. 安装
+<a name="2.1"></a>
+### 2.1 安装依赖
+- **（1）安装PaddlePaddle**
+```bash
+python3 -m pip install --upgrade pip
+# GPU安装
+python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple
+# CPU安装
+python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple
+```
+更多需求，请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
+<a name="2.2"></a>
+### 2.2 安装PaddleOCR
+- **（1）下载版面恢复源码**
+```bash
+# 【推荐】
+git clone https://github.com/PaddlePaddle/PaddleOCR
+# 如果因为网络问题无法pull成功，也可选择使用码云上的托管：
+git clone https://gitee.com/paddlepaddle/PaddleOCR
+# 注：码云托管代码可能无法实时同步本github项目更新，存在3~5天延时，请优先使用推荐方式。
+```
+- **（2）安装recovery的`requirements`**
+```bash
+python3 -m pip install -r ppstructure/recovery/requirements.txt
+```
+<a name="3"></a>
+## 3. 使用
+恢复给定文档的版面：
+```bash
+cd PaddleOCR/ppstructure
+# 下载模型
+mkdir inference && cd inference
+# 下载超轻量级英文PP-OCRv3模型的检测模型并解压
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
+# 下载超轻量级英文PP-OCRv3模型的识别模型并解压
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
+# 下载超轻量级英文表格结构模型并解压
+wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
+cd ..
+# 执行预测
+python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
+```
+运行完成后，每张图片的docx文档会保存到output字段指定的目录下
--- a/ppstructure/recovery/docx.py
+++ b/ppstructure/recovery/docx.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import os
+import pypandoc
+from copy import deepcopy
+from docx import Document
+from docx import shared
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.enum.section import WD_SECTION
+from docx.oxml.ns import qn
+from ppocr.utils.logging import get_logger
+logger = get_logger()
+def convert_info_docx(img, res, save_folder, img_name):
+    """
+    Write the layout analysis result of one image to a Word document
+    Regions are reordered with sorted_layout_boxes and written in that order:
+    'Figure' as a centered picture, 'Title' as a heading, 'Text' as one
+    paragraph and 'Table' as a Word table converted from its html. Whenever
+    the 'layout' of a region changes between 'single' and 'double', a
+    continuous section with one or two columns is started.
+    args:
+        img: image the results belong to, only the width in img.shape is used
+        res(list): ppstructure results, each region has 'type', 'bbox', 'res'
+        save_folder(str): output directory, figure crops are read from
+            <save_folder>/<img_name>/<bbox>.jpg
+        img_name(str): name of the figure sub folder and of the docx file
+    return:
+        None, the document is saved as <save_folder>/<img_name>.docx
+    """
+    # imported here because only this function needs them
+    import tempfile
+    from docx.enum.table import WD_TABLE_ALIGNMENT
+    doc = Document()
+    doc.styles['Normal'].font.name = 'Times New Roman'
+    # the East Asian font is set on the underlying rFonts element
+    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
+    doc.styles['Normal'].font.size = shared.Pt(6.5)
+    w = img.shape[1]
+    res = sorted_layout_boxes(res, w)
+    # column count of the section that is currently being written
+    columns = 1
+    columns_of_layout = {'single': 1, 'double': 2}
+    for region in res:
+        # an unknown layout value keeps the current section, as before
+        wanted = columns_of_layout.get(region['layout'], columns)
+        if wanted != columns:
+            section = doc.add_section(WD_SECTION.CONTINUOUS)
+            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), str(wanted))
+            columns = wanted
+        if region['type'] == 'Figure':
+            img_path = os.path.join(save_folder, img_name,
+                                    '{}.jpg'.format(region['bbox']))
+            paragraph_pic = doc.add_paragraph()
+            paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            run = paragraph_pic.add_run("")
+            # pictures are narrower inside a two column section
+            pic_width = 5 if columns == 1 else 2
+            run.add_picture(img_path, width=shared.Inches(pic_width))
+        elif region['type'] == 'Title':
+            # a title without recognized text has nothing to write
+            if region['res']:
+                doc.add_heading(region['res'][0]['text'])
+        elif region['type'] == 'Text':
+            paragraph = doc.add_paragraph()
+            if region['res']:
+                paragraph.paragraph_format.first_line_indent = shared.Inches(
+                    0.25)
+            for line in region['res']:
+                text_run = paragraph.add_run(line['text'] + ' ')
+                text_run.font.size = shared.Pt(9)
+        elif region['type'] == 'Table':
+            # pypandoc needs an output file for docx; use a private temporary
+            # file instead of a fixed 'tmp.docx' in the working directory,
+            # which could clash with another run or with a user file
+            fd, tmp_path = tempfile.mkstemp(suffix='.docx')
+            os.close(fd)
+            try:
+                # convert_text replaces the deprecated pypandoc.convert
+                pypandoc.convert_text(
+                    region['res']['html'],
+                    'docx',
+                    format='html',
+                    outputfile=tmp_path)
+                tmp_doc = Document(tmp_path)
+                if not tmp_doc.tables:
+                    logger.warning('no table found in html of region {}'.
+                                   format(region['bbox']))
+                    continue
+                paragraph = doc.add_paragraph()
+                # copy the table so that it can be moved into this document
+                new_table = deepcopy(tmp_doc.tables[0])
+                new_table.style = doc.styles['Table Grid']
+                new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
+                paragraph.add_run().element.addnext(new_table._tbl)
+            finally:
+                # also runs on 'continue' and on errors
+                os.remove(tmp_path)
+    # save to docx
+    docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
+    doc.save(docx_path)
+    logger.info('docx save to {}'.format(docx_path))
+def sorted_layout_boxes(res, w):
+    """
+    Sort layout regions into reading order and tag each one with its layout
+    Regions are sorted from top to bottom, left to right. Consecutive regions
+    of the left and of the right column are collected separately and emitted
+    left column first; any other region ends the group and is emitted after
+    it. Every region dict gets a 'layout' key, 'single' or 'double'.
+    args:
+        res(list):ppstructure results, 'bbox' is read as [x0, y0, x1, y1]
+        w(int):width of the image, used to split it into columns
+    return:
+        sorted results(list)
+    """
+    num_boxes = len(res)
+    if num_boxes == 1:
+        res[0]['layout'] = 'single'
+        return res
+    _boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
+    new_res = []
+    res_left = []
+    res_right = []
+    for i, box in enumerate(_boxes):
+        x0, y0, x1 = box['bbox'][0], box['bbox'][1], box['bbox'][2]
+        if i == num_boxes - 1:
+            # the last region always flushes the pending columns
+            below_previous = y0 > _boxes[i - 1]['bbox'][3]
+            if below_previous and x0 < w / 2 and x1 > w / 2:
+                box['layout'] = 'single'
+                new_res += res_left
+                new_res += res_right
+                new_res.append(box)
+            else:
+                box['layout'] = 'double'
+                # a box that does not reach past the middle belongs to the
+                # left column; a plain else keeps malformed boxes (x0 > x1)
+                # from dropping the region and the pending columns
+                if x1 > w / 2:
+                    res_right.append(box)
+                else:
+                    res_left.append(box)
+                new_res += res_left
+                new_res += res_right
+            res_left = []
+            res_right = []
+        elif x0 < w / 4 and x1 < 3 * w / 4:
+            box['layout'] = 'double'
+            res_left.append(box)
+        elif x0 > w / 4 and x1 > w / 2:
+            box['layout'] = 'double'
+            res_right.append(box)
+        else:
+            # neither column matches: close the current two column group
+            new_res += res_left
+            new_res += res_right
+            box['layout'] = 'single'
+            new_res.append(box)
+            res_left = []
+            res_right = []
+    return new_res
\ No newline at end of file
--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
+opencv-contrib-python==4.4.0.46
+pypandoc
+python-docx
\ No newline at end of file
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -61,6 +61,11 @@ def init_args():
        type=str2bool,
        default=True,
        help='In the forward, whether the non-table area is recognition by ocr')
+    parser.add_argument(
+        "--recovery",
+        type=bool,
+        default=False,
+        help='Whether to enable layout of recovery')        
    return parser