# recovery_to_doc.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cv2
import os
import pypandoc
from copy import deepcopy

from docx import Document
from docx import shared
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_SECTION
from docx.oxml.ns import qn
U
user1018 已提交
25 26
from docx.enum.table import WD_TABLE_ALIGNMENT

A
an1018 已提交
27
from ppstructure.recovery.table_process import HtmlToDocx
A
an1018 已提交
28 29 30 31 32

from ppocr.utils.logging import get_logger
logger = get_logger()


U
user1018 已提交
33
def convert_info_docx(img, res, save_folder, img_name, save_pdf):
    """
    Rebuild a Word document (and optionally a PDF) from layout results
    args:
        img: source image; not read by this function, kept so existing
            callers keep working
        res(list): region dicts; every region is read for 'img_idx',
            'layout' ('single' or 'double') and 'type'; figures are also
            read for 'bbox', all other types for 'res'
        save_folder(str): directory that receives '<img_name>.docx' (and
            '<img_name>.pdf'); cropped figures are looked up in
            '<save_folder>/<img_name>/'
        img_name(str): base name of the output files
        save_pdf(bool): when true, also convert the saved docx to pdf
    return:
        None
    """
    doc = Document()
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    # East-Asian text needs its own font entry on the run properties.
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal_style.font.size = shared.Pt(6.5)

    # Column count of the section currently being written (1 or 2).
    columns = 1
    for region in res:
        img_idx = region['img_idx']

        # Open a new continuous section whenever the column layout flips.
        if columns == 2 and region['layout'] == 'single':
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
            columns = 1
        elif columns == 1 and region['layout'] == 'double':
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
            columns = 2

        region_type = region['type'].lower()
        if region_type == 'figure':
            figure_folder = os.path.join(save_folder, img_name)
            img_path = os.path.join(figure_folder,
                                    '{}_{}.jpg'.format(region['bbox'], img_idx))
            paragraph_pic = doc.add_paragraph()
            paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = paragraph_pic.add_run("")
            # Narrower pictures when the page is laid out in two columns.
            picture_width = shared.Inches(5) if columns == 1 else shared.Inches(2)
            run.add_picture(img_path, width=picture_width)
        elif region_type == 'title':
            title_lines = region['res']
            # A title region without any recognized text has nothing to
            # print; skip it instead of failing on title_lines[0].
            if title_lines:
                doc.add_heading(title_lines[0]['text'])
        elif region_type == 'table':
            paragraph = doc.add_paragraph()
            new_parser = HtmlToDocx()
            new_parser.table_style = 'TableGrid'
            table = new_parser.handle_table(html=region['res']['html'])
            new_table = deepcopy(table)
            new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
            paragraph.add_run().element.addnext(new_table._tbl)
        else:
            paragraph = doc.add_paragraph()
            paragraph_format = paragraph.paragraph_format
            for line_idx, line in enumerate(region['res']):
                if line_idx == 0:
                    paragraph_format.first_line_indent = shared.Inches(0.25)
                text_run = paragraph.add_run(line['text'] + ' ')
                text_run.font.size = shared.Pt(10)

    # save to docx
    docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
    doc.save(docx_path)
    logger.info('docx save to {}'.format(docx_path))

    # save to pdf
    if save_pdf:
        pdf_path = os.path.join(save_folder, '{}.pdf'.format(img_name))
        # Imported lazily so docx2pdf is only required when a pdf is requested.
        from docx2pdf import convert
        convert(docx_path, pdf_path)
        logger.info('pdf save to {}'.format(pdf_path))
U
user1018 已提交
93

A
an1018 已提交
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114

def sorted_layout_boxes(res, w):
    """
    Sort text boxes in order from top to bottom, left to right, and tag
    every box with a 'layout' key ('single' or 'double')
    args:
        res(list):ppstructure results; each item is a dict whose 'bbox' is
            read as [x_min, y_min, x_max, y_max]. The dicts are modified in
            place (the 'layout' key is added).
        w(int|float):reference width; the column thresholds are w / 4,
            w / 2 and 3 * w / 4
    return:
        sorted results(list)
    """
    num_boxes = len(res)
    if num_boxes == 1:
        res[0]['layout'] = 'single'
        return res

    boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
    quarter_w = w / 4
    half_w = w / 2
    three_quarter_w = 3 * w / 4

    new_res = []
    # Boxes of the two-column run currently being collected; they are
    # emitted left column first, then right column.
    res_left = []
    res_right = []

    for idx, box in enumerate(boxes):
        x_min = box['bbox'][0]
        x_max = box['bbox'][2]

        if idx == num_boxes - 1:
            # Last box: it always flushes the pending columns.
            starts_below_prev = box['bbox'][1] > boxes[idx - 1]['bbox'][3]
            if starts_below_prev and x_min < half_w and x_max > half_w:
                box['layout'] = 'single'
                new_res += res_left
                new_res += res_right
                new_res.append(box)
            else:
                box['layout'] = 'double'
                # Exhaustive on purpose: a zero-width box sitting on the
                # midline used to match neither column test, which dropped
                # the box together with everything still pending.
                if x_max > half_w or x_min >= half_w:
                    res_right.append(box)
                else:
                    res_left.append(box)
                new_res += res_left
                new_res += res_right
            res_left = []
            res_right = []
        elif x_min < quarter_w and x_max < three_quarter_w:
            box['layout'] = 'double'
            res_left.append(box)
        elif x_min > quarter_w and x_max > half_w:
            box['layout'] = 'double'
            res_right.append(box)
        else:
            # Box spanning both columns: flush the pending columns first.
            new_res += res_left
            new_res += res_right
            box['layout'] = 'single'
            new_res.append(box)
            res_left = []
            res_right = []

    return new_res