diff --git a/paddleocr.py b/paddleocr.py index 6b4de93e9b7ce6b34b610d9bec2de31ab56dda6f..44308a823ed8edca0e979fd8d83414cec337ab9b 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -663,6 +663,16 @@ def main(): if not flag_gif and not flag_pdf: img = cv2.imread(img_path) + if args.recovery and args.use_pdf2docx_api and flag_pdf: + from pdf2docx.converter import Converter + docx_file = os.path.join(args.output, + '{}.docx'.format(img_name)) + cv = Converter(img_path) + cv.convert(docx_file) + cv.close() + logger.info('docx save to {}'.format(docx_file)) + continue + if not flag_pdf: if img is None: logger.error("error in loading image:{}".format(img_path)) diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index 60642f78b6691c3ac2eeba99680a2af23299ddc9..3ff325520c49b913a98f64f1099cce20bb3004ef 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -97,6 +97,19 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout #### 2.1.6 版面恢复 +版面恢复分为2种方法,详细介绍请参考:[版面恢复教程](../recovery/README_ch.md): + +- PDF解析 +- OCR技术 + +通过PDF解析(只支持pdf格式的输入): + +```bash +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +通过OCR技术: + ```bash # 中文测试图 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index e0eec4b38ba57b1bebd0e711093e5dfd4773fdd9..e771877afae83dc91a6e0c2f6fdde1950f6d3ace 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -98,7 +98,21 @@ Key information extraction does not currently support use by the whl package. Fo #### 2.1.6 layout recovery + +Two layout recovery methods are provided, For detailed usage tutorials, please refer to: [Layout Recovery](../recovery/README.md). + +- PDF parse +- OCR + +Recovery by using PDF parse (only support pdf as input): + +```bash +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true ``` + +Recovery by using OCR: + +```bash paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' ``` diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 417002d1ef58471268071f96868617a4c9c52056..bb061c998f6f8b16c06f9ee94299af0f59c53eb2 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -216,16 +216,26 @@ def main(args): image_file_list = image_file_list image_file_list = image_file_list[args.process_id::args.total_process_num] - structure_sys = StructureSystem(args) + if not args.use_pdf2docx_api: + structure_sys = StructureSystem(args) + save_folder = os.path.join(args.output, structure_sys.mode) + os.makedirs(save_folder, exist_ok=True) img_num = len(image_file_list) - save_folder = os.path.join(args.output, structure_sys.mode) - os.makedirs(save_folder, exist_ok=True) for i, image_file in enumerate(image_file_list): logger.info("[{}/{}] {}".format(i, img_num, image_file)) img, flag_gif, flag_pdf = check_and_read(image_file) img_name = os.path.basename(image_file).split('.')[0] + if args.recovery and args.use_pdf2docx_api and flag_pdf: + from pdf2docx.converter import Converter + docx_file = os.path.join(args.output, '{}.docx'.format(img_name)) + cv = Converter(image_file) + cv.convert(docx_file) + cv.close() + logger.info('docx save to {}'.format(docx_file)) + continue + if not flag_gif and not flag_pdf: img = cv2.imread(image_file) diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 0e06c65475b67bcdfc119069fa6f6076322c0e99..41fb3e45b83329cd7bfa7021b19ffcee33dc947b 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -6,18 +6,39 @@ English | [简体中文](README_ch.md) - [2. Install](#2) - [2.1 Install PaddlePaddle](#2.1) - [2.2 Install PaddleOCR](#2.2) -- [3. Quick Start](#3) - - [3.1 Download models](#3.1) - - [3.2 Layout recovery](#3.2) -- [4. More](#4) +- [3. Quick Start using PDF parse](#3) +- [4. Quick Start using OCR](#4) + - [4.1 Download models](#4.1) + - [4.2 Layout recovery](#4.2) +- [5. More](#5) ## 1. Introduction -Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order. +The layout recovery module is used to restore the image or pdf to an +editable Word file consistent with the original image layout. -Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. The following figure shows the effect of restoring the layout of English and Chinese documents: +Two layout recovery methods are provided: + +- PDF parse: Python based PDF to word library [pdf2docx] (https://github.com/dothinking/pdf2docx) is optimized, the method extracts data from PDF with PyMuPDF, then parse layout with rule, finally, generate docx with python-docx. + +- OCR: Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. + +The input formats and application scenarios of the two methods are as follows: + +| method | input formats | application scenarios/problem | +| :-----: | :----------: | :----------------------------------------------------------: | +| PDF parse | pdf | Advantages: Better recovery for non-paper documents, each page remains on the same page after restoration
Disadvantages: English characters in some Chinese documents are garbled, some contents are still beyond the current page, the whole page content is restored to the table format, and the recovery effect of some pictures is not good | +| OCR technique | pdf、picture | Advantages: More suitable for paper document content recovery, OCR recognition effect is more good
Disadvantages: Currently, the recovery is based on rules, the effect of content typesetting (spacing, fonts, etc.) need to be further improved, and the effect of layout recovery depends on layout analysis | + +The following figure shows the effect of restoring the layout of documents by using PDF parse: + +
+ +
+ +The following figures show the effect of restoring the layout of English and Chinese documents by using OCR technique:
@@ -26,6 +47,8 @@ Layout recovery combines [layout analysis](../layout/README.md)、[table recogni
+ + ## 2. Install @@ -61,9 +84,11 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR # Note: Code cloud hosting code may not be able to synchronize the update of this github project in real time, there is a delay of 3 to 5 days, please use the recommended method first. ```` -- **(2) Install recovery's `requirements`** +- **(2) Install recovery `requirements`** + +The layout restoration is exported as docx files, so python-docx API need to be installed, and PyMuPDF api([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) need to be installed to process the input files in pdf format. And if using pdf parse method, we need to install pdf2docx api. -The layout restoration is exported as docx and PDF files, so python-docx and docx2pdf API need to be installed, and PyMuPDF api([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) need to be installed to process the input files in pdf format. +Install all the libraries by running the following command: ```bash python3 -m pip install -r ppstructure/recovery/requirements.txt @@ -71,7 +96,28 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt -## 3. Quick Start +## 3. Quick Start using PDF parse + +`use_pdf2docx_api` use PDF parse for layout recovery, The whl package is also provided for quick use, follow the above code, for more infomation please refer to [quickstart](../docs/quickstart_en.md) for details. + +```bash +# install paddleocr +pip3 install "paddleocr>=2.6" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +Command line: + +```bash +python3 predict_system.py \ + --image_dir=ppstructure/recovery/UnrealText.pdf \ + --recovery=True \ + --use_pdf2docx_api=True \ + --output=../output/ +``` + + +## 4. Quick Start using OCR Through layout analysis, we divided the image/PDF documents into regions, located the key regions, such as text, table, picture, etc., and recorded the location, category, and regional pixel value information of each region. Different regions are processed separately, where: @@ -88,8 +134,8 @@ The whl package is also provided for quick use, follow the above code, for more paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' ``` - -### 3.1 Download models + +### 4.1 Download models If input is English document, download English models: @@ -111,10 +157,10 @@ tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar cd .. ``` If input is Chinese document,download Chinese models: -[Chinese and English ultra-lightweight PP-OCRv3 model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README.md#pp-ocr-series-model-listupdate-on-september-8th)、[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) +[Chinese and English ultra-lightweight PP-OCRv3 model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README.md#pp-ocr-series-model-listupdate-on-september-8th)、[table recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[layout analysis model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) - -### 3.2 Layout recovery + +### 4.2 Layout recovery ```bash @@ -129,7 +175,6 @@ python3 predict_system.py \ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ --vis_font_path=../doc/fonts/simfang.ttf \ --recovery=True \ - --save_pdf=False \ --output=../output/ ``` @@ -137,7 +182,7 @@ After running, the docx of each picture will be saved in the directory specified Field: -- image_dir:test file测试文件, can be picture, picture directory, pdf file, pdf file directory +- image_dir:test file, can be picture, picture directory, pdf file, pdf file directory - det_model_dir:OCR detection model path - rec_model_dir:OCR recognition model path - rec_char_dict_path:OCR recognition dict path. If the Chinese model is used, change to "../ppocr/utils/ppocr_keys_v1.txt". And if you trained the model on your own dataset, change to the trained dictionary @@ -146,12 +191,11 @@ Field: - layout_model_dir:layout analysis model path - layout_dict_path:layout analysis dict path. If the Chinese model is used, change to "../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" - recovery:whether to enable layout of recovery, default False -- save_pdf:when recovery file, whether to save pdf file, default False - output:save the recovery result path - + -## 4. More +## 5. More For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/detection_en.md). diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md index bc8913adca3385a88cb2decc87fa9acffc707257..eaa5260b57db81bc483ff4b32ec4334340002335 100644 --- a/ppstructure/recovery/README_ch.md +++ b/ppstructure/recovery/README_ch.md @@ -6,19 +6,37 @@ - [2. 安装](#2) - [2.1 安装PaddlePaddle](#2.1) - [2.2 安装PaddleOCR](#2.2) -- [3. 使用](#3) - - [3.1 下载模型](#3.1) - - [3.2 版面恢复](#3.2) -- [4. 更多](#4) - +- [3.使用 PDF解析进行版面恢复](#3) +- [4. 使用 OCR技术进行版面恢复](#4) + - [4.1 下载模型](#4.1) + - [4.2 版面恢复](#4.2) +- [5. 更多](#5) ## 1. 简介 -版面恢复就是在OCR识别后,内容仍然像原文档图片那样排列着,段落不变、顺序不变的输出到word文档中等。 +版面恢复就是将输入的图片、pdf内容仍然像原文档那样排列着,段落不变、顺序不变的输出到word文档中等。 + +提供了2种版面恢复方法: + +- PDF解析:基于Python的pdf转word库[pdf2docx](https://github.com/dothinking/pdf2docx)进行优化,该方法通过PyMuPDF获取页面元素,然后利用规则解析章节、段落、表格等布局及样式,最后通过python-docx将解析的内容元素重建到word文档中。 +- OCR技术:结合[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,支持中、英文pdf文档、文档图片格式的输入文件。 + +2种方法输入格式、适用场景如下: -版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,支持中、英文pdf文档、文档图片格式的输入文件,下图分别展示了英文文档和中文文档版面恢复的效果: +| 方法 | 支持输入文件 | 适用场景/存在问题 | +| :-----: | :----------: | :----------------------------------------------------------: | +| PDF解析 | pdf | 优点:非论文文档恢复效果更优、每一页内容恢复后仍在同一页
缺点:有些中文文档中的英文乱码、仍存在内容超出当前页面的情况、整页内容恢复为表格格式、部分图片恢复效果不佳 | +| OCR技术 | pdf、图片 | 优点:更适合论文文档正文内容的恢复、中英文文档OCR识别效果好
缺点:目前内容恢复基于规则,内容排版效果(间距、字体等)待进一步提升、版面恢复效果依赖于版面分析效果 | + +下图展示了通过PDF解析版面恢复效果: + +
+ +
+ +下图分别展示了通过OCR技术,英文文档和中文文档版面恢复的效果:
@@ -64,7 +82,9 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR - **(2)安装recovery的`requirements`** -版面恢复导出为docx、pdf文件,所以需要安装python-docx、docx2pdf API,同时处理pdf格式的输入文件,需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。 +版面恢复导出为docx文件,所以需要安装Python处理word文档的python-docx API,同时处理pdf格式的输入文件,需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。使用pdf2docx库解析的方式恢复文档需要安装pdf2docx等。 + +通过如下命令安装全部库: ```bash python3 -m pip install -r ppstructure/recovery/requirements.txt @@ -72,7 +92,29 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt -## 3. 使用 +## 3.使用 PDF解析进行版面恢复 + +`use_pdf2docx_api`表示使用PDF解析的方式进行版面恢复,通过whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../docs/quickstart.md)。 + +```bash +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +通过命令行的方式: + +```bash +python3 predict_system.py \ + --image_dir=ppstructure/recovery/UnrealText.pdf \ + --recovery=True \ + --use_pdf2docx_api=True \ + --output=../output/ +``` + + + +## 4.使用 OCR技术进行版面恢复 我们通过版面分析对图片/pdf形式的文档进行区域划分,定位其中的关键区域,如文字、表格、图片等,记录每个区域的位置、类别、区域像素值信息。对不同的区域分别处理,其中: @@ -86,6 +128,8 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt 提供如下代码实现版面恢复,也提供了whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../docs/quickstart.md)。 ```bash +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" # 中文测试图 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true # 英文测试图 @@ -94,9 +138,9 @@ paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=t paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en' ``` - + -### 3.1 下载模型 +### 4.1 下载模型 如果输入为英文文档类型,下载OCR检测和识别、版面分析、表格识别的英文模型 @@ -122,9 +166,9 @@ cd .. [PP-OCRv3中英文超轻量文本检测和识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README_ch.md#pp-ocr%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8%E6%9B%B4%E6%96%B0%E4%B8%AD)、[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) - + -### 3.2 版面恢复 +### 4.2 版面恢复 使用下载的模型恢复给定文档的版面,以英文模型为例,执行如下命令: @@ -140,7 +184,6 @@ python3 predict_system.py \ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ --vis_font_path=../doc/fonts/simfang.ttf \ --recovery=True \ - --save_pdf=False \ --output=../output/ ``` @@ -157,12 +200,11 @@ python3 predict_system.py \ - layout_model_dir:版面分析模型路径 - layout_dict_path:版面分析字典,如果更换为中文模型,需要更改为"../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" - recovery:是否进行版面恢复,默认False -- save_pdf:进行版面恢复导出docx文档的同时,是否保存为pdf文件,默认为False - output:版面恢复结果保存路径 - + -## 4. 更多 +## 5. 更多 关于OCR检测模型的训练评估与推理,请参考:[文本检测教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/detection.md) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 7ddc3391338e5a2a87f9cea9fca006dc03da58fb..4e4239a14af9b6f95aca1171f25d50da5eac37cf 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,3 +1,5 @@ python-docx -PyMuPDF -beautifulsoup4 \ No newline at end of file +PyMuPDF==1.19.0 +beautifulsoup4 +fonttools>=4.24.0 +fire>=0.3.0 \ No newline at end of file diff --git a/ppstructure/utility.py b/ppstructure/utility.py index 7f8a06d2ec1cd18f19975542667cc0f2cf8ad825..d909f1a8a165745a5c0df78cc3d89960ec4469e7 100644 --- a/ppstructure/utility.py +++ b/ppstructure/utility.py @@ -93,6 +93,11 @@ def init_args(): type=str2bool, default=False, help='Whether to enable layout of recovery') + parser.add_argument( + "--use_pdf2docx_api", + type=str2bool, + default=False, + help='Whether to use pdf2docx api') return parser diff --git a/requirements.txt b/requirements.txt index 7a018b50952a876b4839eabbd72fac09d2bbd73b..c90ca37719a3f2156b420c35d19ebdf90fe3ed54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,4 @@ premailer openpyxl attrdict Polygon3 -PyMuPDF==1.18.7 +PyMuPDF==1.19.0