Commit 7e5e95d6 authored by an1018

add recovery

Parent e10490dd
...@@ -16,14 +16,14 @@
<center><img src='https://ai-studio-static-online.cdn.bcebos.com/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e'></center>
<center>Figure 1 Multimodal form recognition pipeline</center>
Note: You are welcome to claim free compute on AIStudio and try the hands-on project online. Project link: [Multimodal Form Recognition](https://aistudio.baidu.com/aistudio/projectdetail/3884375) (equipped with premium compute resources such as Tesla V100 and A100)
# 2 Installation
Download the PaddleOCR source code. The AIStudio project above already bundles PaddleOCR for you (with the config files pre-modified), so there is no need to download or unzip it; just install the dependency environment.
```python
...@@ -33,7 +33,7 @@
```python
# If you still need to install or update it, run the following steps
# ! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
# ! git clone https://gitee.com/PaddlePaddle/PaddleOCR
```
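If the dependencies still need to be installed, the following is a minimal sketch; it assumes the repository was just cloned into `PaddleOCR/` and that PaddlePaddle itself is already available, as on AIStudio:

```python
# install PaddleOCR's Python dependencies from the cloned repository
! pip install -r PaddleOCR/requirements.txt
```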
...@@ -290,7 +290,7 @@ Eval.dataset.transforms.DetResizeForTest: evaluation size, add the following parameters
<center><img src="https://ai-studio-static-online.cdn.bcebos.com/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77"></center>
<center>Figure 8 Text detection scheme 2: model evaluation</center>
Use the trained model for evaluation and update the model path `Global.checkpoints`. A trained model is provided here: `./pretrain/ch_db_mv3-student1600-finetune/best_accuracy` ([model download link](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/ch_db_mv3-student1600-finetune.zip))
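For reference, a sketch of the evaluation call (run from the PaddleOCR directory; the config path below is illustrative and should be the detection config used earlier in this tutorial):

```python
# evaluate the provided detection checkpoint by overriding Global.checkpoints
! python tools/eval.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml \
    -o Global.checkpoints=./pretrain/ch_db_mv3-student1600-finetune/best_accuracy
```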
```python
...@@ -538,7 +538,7 @@ Train.dataset.ratio_list: dynamic sampling
<center>Figure 16 Text recognition scheme 3: model evaluation</center>
Use the trained model for evaluation and update the model path `Global.checkpoints`. A trained model is provided here: `./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy` ([model download link](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/rec_mobile_pp-OCRv2-student-realdata.zip))
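Likewise, a sketch of the recognition evaluation call (the config path is illustrative and should be the recognition config used in this tutorial):

```python
# evaluate the provided recognition checkpoint by overriding Global.checkpoints
! python tools/eval.py -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml \
    -o Global.checkpoints=./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy
```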
```python
......
...@@ -23,6 +23,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
import cv2
import json
import numpy as np
import time
import logging
from copy import deepcopy
...@@ -33,6 +34,7 @@ from ppocr.utils.logging import get_logger
from tools.infer.predict_system import TextSystem
from ppstructure.table.predict_table import TableSystem, to_excel
from ppstructure.utility import parse_args, draw_structure_result
from ppstructure.recovery.docx import convert_info_docx
logger = get_logger()
...@@ -104,7 +106,12 @@ class StructureSystem(object):
                return_ocr_result_in_table)
        else:
            if self.text_system is not None:
                if args.recovery:
                    # in recovery mode, run OCR on a blank canvas of the original
                    # image size with the region pasted in, so that the detected
                    # boxes stay in full-page coordinates
                    wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype)
                    wht_im[y1:y2, x1:x2, :] = roi_img
                    filter_boxes, filter_rec_res = self.text_system(wht_im)
                else:
                    filter_boxes, filter_rec_res = self.text_system(roi_img)
                # remove style char
                style_token = [
                    '<strike>', '<strike>', '<sup>', '</sub>', '<b>',
...@@ -118,7 +125,8 @@ class StructureSystem(object):
                for token in style_token:
                    if token in rec_str:
                        rec_str = rec_str.replace(token, '')
                # recovery keeps boxes in full-page coordinates, so the region
                # offset is only added when recovery is disabled
                if not args.recovery:
                    box += [x1, y1]
                res.append({
                    'text': rec_str,
                    'confidence': float(rec_conf),
...@@ -192,6 +200,8 @@ def main(args):
        # img_save_path = os.path.join(save_folder, img_name + '.jpg')
        cv2.imwrite(img_save_path, draw_img)
        logger.info('result save to {}'.format(img_save_path))
        if args.recovery:
            convert_info_docx(img, res, save_folder, img_name)
        elapse = time.time() - starttime
        logger.info("Predict time : {:.3f}s".format(elapse))
......
English | [简体中文](README_ch.md)
- [Getting Started](#getting-started)
  - [1. Introduction](#1)
  - [2. Quick Start](#2)
<a name="1"></a>
## 1. Introduction
Layout recovery means that after OCR recognition, the content is still arranged as in the original document image, with the paragraphs output to a Word document in the same order.
Layout recovery combines [layout analysis](../layout/README.md) and [table recognition](../table/README.md) to better recover images, tables, titles, etc.
The following figure shows the result:
<div align="center">
<img src="../docs/table/recovery.jpg" width = "700" />
</div>
<a name="2"></a>
## 2. Quick Start
```bash
cd PaddleOCR/ppstructure
# download model
mkdir inference && cd inference
# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
# Download the ultra-lightweight English table structure recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
# run
python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
```
After running, the docx file for each image will be saved in the directory specified by the `--output` argument.
\ No newline at end of file
[English](README.md) | 简体中文
# Layout Recovery Usage Guide
- [1. Introduction](#1)
- [2. Usage](#2)
<a name="1"></a>
## 1. Introduction
Layout recovery means that after OCR recognition, the content is still arranged as in the original document image, and is output to a Word document with the paragraphs and their order unchanged.
Layout recovery combines [layout analysis](../layout/README_ch.md) and [table recognition](../table/README_ch.md) to better recover images, tables, titles, etc. The following figure shows the result:
<div align="center">
<img src="../docs/table/recovery.jpg" width = "700" />
</div>
<a name="2"></a>
## 2. Usage
Recover the layout of a given document:
```bash
cd PaddleOCR/ppstructure
# download models
mkdir inference && cd inference
# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
# Download the ultra-lightweight English table structure recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
# run prediction
python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
```
After running, the docx file for each image will be saved in the directory specified by the `--output` argument.
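For illustration, a minimal sketch of calling `convert_info_docx` (defined below in `ppstructure/recovery/docx.py`) directly with a hand-built result list; it assumes the PaddleOCR root is on `PYTHONPATH`, that the recovery dependencies (`python-docx`, `pypandoc`) are installed, and that the region values are made up only to show the expected structure:

```python
# Illustrative sketch: feed convert_info_docx a hand-built result list.
# Each region carries 'type', 'bbox' and a per-type 'res' payload, mirroring
# what predict_system.py produces.
import os
import numpy as np
from ppstructure.recovery.docx import convert_info_docx

img = np.zeros((1000, 800, 3), dtype=np.uint8)  # only img.shape is used for layout sorting
res = [
    {'type': 'Title', 'bbox': [60, 40, 700, 90],
     'res': [{'text': 'A sample title', 'confidence': 0.98}]},
    {'type': 'Text', 'bbox': [60, 120, 700, 300],
     'res': [{'text': 'First line of a sample paragraph.', 'confidence': 0.95}]},
]
save_folder = './output/table'
os.makedirs(save_folder, exist_ok=True)
convert_info_docx(img, res, save_folder, 'demo')  # writes ./output/table/demo.docx
```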
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import os
import pypandoc
from copy import deepcopy
from docx import Document
from docx import shared
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_SECTION
from docx.oxml.ns import qn
from ppocr.utils.logging import get_logger
logger = get_logger()
def convert_info_docx(img, res, save_folder, img_name):
    doc = Document()
    doc.styles['Normal'].font.name = 'Times New Roman'
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    doc.styles['Normal'].font.size = shared.Pt(6.5)
    h, w, _ = img.shape

    res = sorted_layout_boxes(res, w)

    # flag tracks the current section layout: 1 = single column, 2 = double column
    flag = 1
    for i, region in enumerate(res):
        if flag == 2 and region['layout'] == 'single':
            # switch back to a single-column section
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
            flag = 1
        elif flag == 1 and region['layout'] == 'double':
            # switch to a two-column section
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
            flag = 2

        if region['type'] == 'Figure':
            excel_save_folder = os.path.join(save_folder, img_name)
            img_path = os.path.join(excel_save_folder,
                                    '{}.jpg'.format(region['bbox']))
            paragraph_pic = doc.add_paragraph()
            paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = paragraph_pic.add_run("")
            if flag == 1:
                run.add_picture(img_path, width=shared.Inches(5))
            elif flag == 2:
                run.add_picture(img_path, width=shared.Inches(2))
        elif region['type'] == 'Title':
            doc.add_heading(region['res'][0]['text'])
        elif region['type'] == 'Text':
            paragraph = doc.add_paragraph()
            paragraph_format = paragraph.paragraph_format
            for i, line in enumerate(region['res']):
                if i == 0:
                    paragraph_format.first_line_indent = shared.Inches(0.25)
                text_run = paragraph.add_run(line['text'] + ' ')
                text_run.font.size = shared.Pt(9)
        elif region['type'] == 'Table':
            # convert the predicted HTML table to a temporary docx via pandoc,
            # then copy the resulting table into the output document
            pypandoc.convert(
                source=region['res']['html'],
                format='html',
                to='docx',
                outputfile='tmp.docx')
            tmp_doc = Document('tmp.docx')
            paragraph = doc.add_paragraph()

            table = tmp_doc.tables[0]
            new_table = deepcopy(table)
            new_table.style = doc.styles['Table Grid']
            from docx.enum.table import WD_TABLE_ALIGNMENT
            new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
            paragraph.add_run().element.addnext(new_table._tbl)
            os.remove('tmp.docx')
        else:
            continue

    # save to docx
    docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
    doc.save(docx_path)
    logger.info('docx save to {}'.format(docx_path))
def sorted_layout_boxes(res, w):
    """
    Sort text boxes in order from top to bottom, left to right
    args:
        res(list): ppstructure results
    return:
        sorted results(list)
    """
    num_boxes = len(res)
    if num_boxes == 1:
        res[0]['layout'] = 'single'
        return res

    sorted_boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
    _boxes = list(sorted_boxes)

    new_res = []
    res_left = []
    res_right = []
    i = 0
    while True:
        if i >= num_boxes:
            break
        if i == num_boxes - 1:
            # last box: treat it as single-column if it spans the page centre and
            # starts below the previous box, otherwise assign it to a column
            if _boxes[i]['bbox'][1] > _boxes[i - 1]['bbox'][3] and _boxes[i][
                    'bbox'][0] < w / 2 and _boxes[i]['bbox'][2] > w / 2:
                new_res += res_left
                new_res += res_right
                _boxes[i]['layout'] = 'single'
                new_res.append(_boxes[i])
            else:
                if _boxes[i]['bbox'][2] > w / 2:
                    _boxes[i]['layout'] = 'double'
                    res_right.append(_boxes[i])
                    new_res += res_left
                    new_res += res_right
                elif _boxes[i]['bbox'][0] < w / 2:
                    _boxes[i]['layout'] = 'double'
                    res_left.append(_boxes[i])
                    new_res += res_left
                    new_res += res_right
            res_left = []
            res_right = []
            break
        elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3 * w / 4:
            # box lies mostly in the left half of the page: left column
            _boxes[i]['layout'] = 'double'
            res_left.append(_boxes[i])
            i += 1
        elif _boxes[i]['bbox'][0] > w / 4 and _boxes[i]['bbox'][2] > w / 2:
            # box lies mostly in the right half of the page: right column
            _boxes[i]['layout'] = 'double'
            res_right.append(_boxes[i])
            i += 1
        else:
            # box spans both halves: flush pending columns and emit it as single-column
            new_res += res_left
            new_res += res_right
            _boxes[i]['layout'] = 'single'
            new_res.append(_boxes[i])
            res_left = []
            res_right = []
            i += 1
    if res_left:
        new_res += res_left
    if res_right:
        new_res += res_right
    return new_res
\ No newline at end of file
...@@ -61,6 +61,11 @@ def init_args():
        type=str2bool,
        default=True,
        help='In the forward, whether the non-table area is recognition by ocr')
    parser.add_argument(
        "--recovery",
        type=str2bool,
        default=False,
        help='Whether to enable layout recovery')
    return parser
......