From b7d99acd2e06945c789312cda70d60b7c8a5b0d0 Mon Sep 17 00:00:00 2001 From: user1018 <614803115@qq.com> Date: Fri, 19 Aug 2022 20:15:37 +0800 Subject: [PATCH] update recovery (#7259) * update recovery * update recovery * update recovery * update recovery * update recovery --- ppocr/utils/utility.py | 28 +- ppstructure/layout/layout_in_ocr.md | 469 ------------------ ppstructure/predict_system.py | 114 +++-- ppstructure/recovery/README.md | 22 +- ppstructure/recovery/README_ch.md | 33 +- ppstructure/recovery/recovery_to_doc.py | 52 +- ppstructure/recovery/requirements.txt | 6 +- ppstructure/recovery/table_process.py | 632 ++++++++++++++++++++++++ ppstructure/utility.py | 5 + 9 files changed, 814 insertions(+), 547 deletions(-) delete mode 100644 ppstructure/layout/layout_in_ocr.md create mode 100644 ppstructure/recovery/table_process.py diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py index b881fcab..18357c8e 100755 --- a/ppocr/utils/utility.py +++ b/ppocr/utils/utility.py @@ -50,7 +50,7 @@ def get_check_global_params(mode): def _check_image_file(path): - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'} + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} return any([path.lower().endswith(e) for e in img_end]) @@ -59,7 +59,7 @@ def get_image_file_list(img_file): if img_file is None or not os.path.exists(img_file): raise Exception("not found any img file in {}".format(img_file)) - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'} + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} if os.path.isfile(img_file) and _check_image_file(img_file): imgs_lists.append(img_file) elif os.path.isdir(img_file): @@ -73,7 +73,7 @@ def get_image_file_list(img_file): return imgs_lists -def check_and_read_gif(img_path): +def check_and_read(img_path): if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: gif = cv2.VideoCapture(img_path) ret, frame = gif.read() @@ -84,8 +84,26 @@ def 
check_and_read_gif(img_path): if len(frame.shape) == 2 or frame.shape[-1] == 1: frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) imgvalue = frame[:, :, ::-1] - return imgvalue, True - return None, False + return imgvalue, True, False + elif os.path.basename(img_path)[-3:] in ['pdf']: + import fitz + from PIL import Image + imgs = [] + with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + return imgs, False, True + return None, False, False def load_vqa_bio_label_maps(label_map_path): diff --git a/ppstructure/layout/layout_in_ocr.md b/ppstructure/layout/layout_in_ocr.md deleted file mode 100644 index 3762544b..00000000 --- a/ppstructure/layout/layout_in_ocr.md +++ /dev/null @@ -1,469 +0,0 @@ -- [1. 简介](#1-简介) - -- [2. 安装](#2-安装) - - - [2.1 安装PaddlePaddle](#21-安装paddlepaddle) - - [2.2 安装PaddleDetection](#22-安装paddledetection) - -- [3. 数据准备](#3-数据准备) - - - [3.1 英文数据集](#31-英文数据集) - - [3.2 更多数据集](#32-更多数据集) - -- [4. 开始训练](#4-开始训练) - - - [4.1 启动训练](#41-启动训练) - - [4.2 FGD蒸馏训练](#42-FGD蒸馏训练) - -- [5. 模型评估与预测](#5-模型评估与预测) - - - [5.1 指标评估](#51-指标评估) - - [5.2 测试版面分析结果](#52-测试版面分析结果) - -- [6 模型导出与预测](#6-模型导出与预测) - - - [6.1 模型导出](#61-模型导出) - - - [6.2 模型推理](#62-模型推理) - -# 版面分析 - -## 1. 简介 - -版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等。版面分析算法基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)的轻量模型PP-PicoDet进行开发。 - -
- -
- - - -## 2. 安装依赖 - -### 2.1. 安装PaddlePaddle - -- **(1) 安装PaddlePaddle** - -```bash -python3 -m pip install --upgrade pip - -# GPU安装 -python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple - -# CPU安装 -python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple -``` -更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 - -### 2.2. 安装PaddleDetection - -- **(1)下载PaddleDetection源码** - -```bash -git clone https://github.com/PaddlePaddle/PaddleDetection.git -``` - -- **(2)安装其他依赖 ** - -```bash -cd PaddleDetection -python3 -m pip install -r requirements.txt -``` - -## 3. 数据准备 - -如果希望直接体验预测过程,可以跳过数据准备,下载我们提供的预训练模型。 - -### 3.1. 英文数据集 - -下载文档分析数据集[PubLayNet](https://developer.ibm.com/exchanges/data/all/publaynet/)(数据集96G),包含5个类:`{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` - -``` -# 下载数据 -wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz -# 解压数据 -tar -xvf publaynet.tar.gz -``` - -解压之后的**目录结构:** - -``` -|-publaynet - |- test - |- PMC1277013_00004.jpg - |- PMC1291385_00002.jpg - | ... - |- train.json - |- train - |- PMC1291385_00002.jpg - |- PMC1277013_00004.jpg - | ... - |- val.json - |- val - |- PMC538274_00004.jpg - |- PMC539300_00004.jpg - | ... 
-``` - -**数据分布:** - -| File or Folder | Description | num | -| :------------- | :------------- | ------- | -| `train/` | 训练集图片 | 335,703 | -| `val/` | 验证集图片 | 11,245 | -| `test/` | 测试集图片 | 11,405 | -| `train.json` | 训练集标注文件 | - | -| `val.json` | 验证集标注文件 | - | - -**标注格式:** - -json文件包含所有图像的标注,数据以字典嵌套的方式存放,包含以下key: - -- info,表示标注文件info。 - -- licenses,表示标注文件licenses。 - -- images,表示标注文件中图像信息列表,每个元素是一张图像的信息。如下为其中一张图像的信息: - - ``` - { - 'file_name': 'PMC4055390_00006.jpg', # file_name - 'height': 601, # image height - 'width': 792, # image width - 'id': 341427 # image id - } - ``` - -- annotations,表示标注文件中目标物体的标注信息列表,每个元素是一个目标物体的标注信息。如下为其中一个目标物体的标注信息: - - ``` - { - - 'segmentation': # 物体的分割标注 - 'area': 60518.099043117836, # 物体的区域面积 - 'iscrowd': 0, # iscrowd - 'image_id': 341427, # image id - 'bbox': [50.58, 490.86, 240.15, 252.16], # bbox [x1,y1,w,h] - 'category_id': 1, # category_id - 'id': 3322348 # image id - } - ``` - -### 3.2. 更多数据集 - -我们提供了CDLA(中文版面分析)、TableBank(表格版面分析)等数据集的下连接,处理为上述标注文件json格式,即可以按相同方式进行训练。 - -| dataset | 简介 | -| ------------------------------------------------------------ | ------------------------------------------------------------ | -| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | 用于表格检测(TRACKA)和表格识别(TRACKB)。图片类型包含历史数据集(以cTDaR_t0开头,如cTDaR_t00872.jpg)和现代数据集(以cTDaR_t1开头,cTDaR_t10482.jpg)。 | -| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | 手动注释公开的年度报告中的图形或页面而构建的数据集,包含5类:table, figure, natural image, logo, and signature | -| [CDLA](https://github.com/buptlihang/CDLA) | 中文文档版面分析数据集,面向中文文献类(论文)场景,包含10类:Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation | -| [TableBank](https://github.com/doc-analysis/TableBank) | 用于表格检测和识别大型数据集,包含Word和Latex2种文档格式 | -| [DocBank](https://github.com/doc-analysis/DocBank) | 使用弱监督方法构建的大规模数据集(500K文档页面),用于文档布局分析,包含12类:Author、Caption、Date、Equation、Figure、Footer、List、Paragraph、Reference、Section、Table、Title | - - -## 4. 
开始训练 - -提供了训练脚本、评估脚本和预测脚本,本节将以PubLayNet预训练模型为例进行讲解。 - -如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型,并跳过本部分。 - -``` -mkdir pretrained_model -cd pretrained_model -# 下载并解压PubLayNet预训练模型 -wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams -``` - -### 4.1. 启动训练 - -开始训练: - -* 修改配置文件 - -如果你希望训练自己的数据集,需要修改配置文件中的数据配置、类别数。 - - -以`configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。 - -```yaml -metric: COCO -# 类别数 -num_classes: 5 - -TrainDataset: - !COCODataSet - # 修改为你自己的训练数据目录 - image_dir: train - # 修改为你自己的训练数据标签文件 - anno_path: train.json - # 修改为你自己的训练数据根目录 - dataset_dir: /root/publaynet/ - data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] - -EvalDataset: - !COCODataSet - # 修改为你自己的验证数据目录 - image_dir: val - # 修改为你自己的验证数据标签文件 - anno_path: val.json - # 修改为你自己的验证数据根目录 - dataset_dir: /root/publaynet/ - -TestDataset: - !ImageFolder - # 修改为你自己的测试数据标签文件 - anno_path: /root/publaynet/val.json -``` - -* 开始训练,在训练时,会默认下载PP-PicoDet预训练模型,这里无需预先下载。 - -```bash -# GPU训练 支持单卡,多卡训练 -# 训练日志会自动保存到 log 目录中 - -# 单卡训练 -python3 tools/train.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --eval - -# 多卡训练,通过--gpus参数指定卡号 -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --eval -``` - -正常启动训练后,会看到以下log输出: - -``` -[08/15 04:02:30] ppdet.utils.checkpoint INFO: Finish loading model weights: /root/.cache/paddle/weights/LCNet_x1_0_pretrained.pdparams -[08/15 04:02:46] ppdet.engine INFO: Epoch: [0] [ 0/1929] learning_rate: 0.040000 loss_vfl: 1.216707 loss_bbox: 1.142163 loss_dfl: 0.544196 loss: 2.903065 eta: 17 days, 13:50:26 batch_cost: 15.7452 data_cost: 2.9112 ips: 1.5243 images/s -[08/15 04:03:19] ppdet.engine INFO: Epoch: [0] [ 20/1929] learning_rate: 0.064000 loss_vfl: 1.180627 loss_bbox: 0.939552 loss_dfl: 0.442436 loss: 
2.628206 eta: 2 days, 12:18:53 batch_cost: 1.5770 data_cost: 0.0008 ips: 15.2184 images/s -[08/15 04:03:47] ppdet.engine INFO: Epoch: [0] [ 40/1929] learning_rate: 0.088000 loss_vfl: 0.543321 loss_bbox: 1.071401 loss_dfl: 0.457817 loss: 2.057003 eta: 2 days, 0:07:03 batch_cost: 1.3190 data_cost: 0.0007 ips: 18.1954 images/s -[08/15 04:04:12] ppdet.engine INFO: Epoch: [0] [ 60/1929] learning_rate: 0.112000 loss_vfl: 0.630989 loss_bbox: 0.859183 loss_dfl: 0.384702 loss: 1.883143 eta: 1 day, 19:01:29 batch_cost: 1.2177 data_cost: 0.0006 ips: 19.7087 images/s -``` - -- `--eval`表示训练的同时,进行评估, 评估过程中默认将最佳模型,保存为 `output/picodet_lcnet_x1_0_layout/best_accuracy` 。 - -**注意,预测/评估时的配置文件请务必与训练一致。** - -### 4.2. FGD蒸馏训练 - -PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for Detectors](https://arxiv.org/abs/2111.11837v1))蒸馏的目标检测模型训练过程,FGD蒸馏分为两个部分`Focal`和`Global`。`Focal`蒸馏分离图像的前景和背景,让学生模型分别关注教师模型的前景和背景部分特征的关键像素;`Global`蒸馏部分重建不同像素之间的关系并将其从教师转移到学生,以补偿`Focal`蒸馏中丢失的全局信息。 - -更换数据集,修改【TODO】配置中的数据配置、类别数,具体可以参考4.1。启动训练: - -```bash -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ - --eval -``` - -- `-c`: 指定模型配置文件。 -- `--slim_config`: 指定压缩策略配置文件。 - -## 5. 模型评估与预测 - -### 5.1. 
指标评估 - -训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir`、`anno_path`和`dataset_dir` 设置。 - -```bash -# GPU 评估, weights 为待测权重 -python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - -o weigths=./output/picodet_lcnet_x1_0_layout/best_model -``` - -会输出以下信息,打印出mAP、AP0.5等信息。 - -```py - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.935 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.956 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.404 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.782 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.539 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.938 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.949 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.495 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.818 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.978 -[08/15 07:07:09] ppdet.engine INFO: Total sample number: 11245, averge FPS: 24.405059207157436 -[08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. -``` - -使用FGD蒸馏模型进行评估: - -``` -python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ - -o weights=output/picodet_lcnet_x2_5_layout/best_model -``` - -- `-c`: 指定模型配置文件。 -- `--slim_config`: 指定蒸馏策略配置文件。 -- `-o weights`: 指定蒸馏算法训好的模型路径。 - -### 5.2. 
测试版面分析结果 - - -预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。 - -使用 PaddleDetection 训练好的模型,您可以使用如下命令进行中文模型预测。 - - -```bash -python3 tools/infer.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ - --infer_img='docs/images/layout.jpg' \ - --output_dir=output_dir/ \ - --draw_threshold=0.4 -``` - -- `--infer_img`: 推理单张图片,也可以通过`--infer_dir`推理文件中的所有图片。 -- `--output_dir`: 指定可视化结果保存路径。 -- `--draw_threshold`:指定绘制结果框的NMS阈值。 - -预测图片如下所示,图片会存储在`output_dir`路径中。 - -使用FGD蒸馏模型进行测试: - -``` -python3 tools/infer.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ - -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ - --infer_img='docs/images/layout.jpg' \ - --output_dir=output_dir/ \ - --draw_threshold=0.4 -``` - - - -## 6. 
模型导出与预测 - - -### 6.1 模型导出 - -inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 - -版面分析模型转inference模型步骤如下: - -```bash -python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - -o weights=output/picodet_lcnet_x1_0_layout/best_model \ - --output_dir=output_inference/ -``` - -* 如无需导出后处理,请指定:`-o export.benchmark=True`(如果-o已出现过,此处删掉-o) -* 如无需导出NMS,请指定:`-o export.nms=False` - -转换成功后,在目录下有三个文件: - -``` -output_inference/picodet_lcnet_x1_0_layout/ - ├── model.pdiparams # inference模型的参数文件 - ├── model.pdiparams.info # inference模型的参数信息,可忽略 - └── model.pdmodel # inference模型的模型结构文件 -``` - -FGD蒸馏模型转inference模型步骤如下: - -```bash -python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/publayernet_lcnet_x1_5/picodet_student.yml \ - --slim_config configs/picodet/legacy_model/application/publayernet_lcnet_x1_5/picodet_teacher.yml \ - -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ - --output_dir=output_inference/ -``` - - - -### 6.2 模型推理 - -版面恢复任务进行推理,可以执行如下命令: - -```bash -python3 deploy/python/infer.py \ - --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ - --image_file=docs/images/layout.jpg \ - --device=CPU -``` - -- --device:指定GPU、CPU设备 - -模型推理完成,会看到以下log输出 - -``` ------------------------------------------- ------------ Model Configuration ----------- -Model Arch: PicoDet -Transform Order: ---transform op: Resize ---transform op: NormalizeImage ---transform op: Permute ---transform op: PadStride --------------------------------------------- -class_id:0, confidence:0.9921, left_top:[20.18,35.66],right_bottom:[341.58,600.99] -class_id:0, confidence:0.9914, left_top:[19.77,611.42],right_bottom:[341.48,901.82] -class_id:0, confidence:0.9904, left_top:[369.36,375.10],right_bottom:[691.29,600.59] -class_id:0, confidence:0.9835, 
left_top:[369.60,608.60],right_bottom:[691.38,736.72] -class_id:0, confidence:0.9830, left_top:[369.58,805.38],right_bottom:[690.97,901.80] -class_id:0, confidence:0.9716, left_top:[383.68,271.44],right_bottom:[688.93,335.39] -class_id:0, confidence:0.9452, left_top:[370.82,34.48],right_bottom:[688.10,63.54] -class_id:1, confidence:0.8712, left_top:[370.84,771.03],right_bottom:[519.30,789.13] -class_id:3, confidence:0.9856, left_top:[371.28,67.85],right_bottom:[685.73,267.72] -save result to: output/layout.jpg -Test iter 0 ------------------- Inference Time Info ---------------------- -total_time(ms): 2196.0, img_num: 1 -average latency time(ms): 2196.00, QPS: 0.455373 -preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 11.60 -``` - -- Model:模型结构 -- Transform Order:预处理操作 -- class_id、confidence、left_top、right_bottom:分别表示类别id、置信度、左上角坐标、右下角坐标 -- save result to:可视化版面分析结果保存路径,默认保存到`./output`文件夹 -- Inference Time Info:推理时间,其中preprocess_time表示预处理耗时,inference_time表示模型预测耗时,postprocess_time表示后处理耗时 - -可视化版面结果如下图所示 - -
- -
- - - -## Citations - -``` -@inproceedings{zhong2019publaynet, - title={PubLayNet: largest dataset ever for document layout analysis}, - author={Zhong, Xu and Tang, Jianbin and Yepes, Antonio Jimeno}, - booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, - year={2019}, - volume={}, - number={}, - pages={1015-1022}, - doi={10.1109/ICDAR.2019.00166}, - ISSN={1520-5363}, - month={Sep.}, - organization={IEEE} -} - -@inproceedings{yang2022focal, - title={Focal and global knowledge distillation for detectors}, - author={Yang, Zhendong and Li, Zhe and Jiang, Xiaohu and Gong, Yuan and Yuan, Zehuan and Zhao, Danpei and Yuan, Chun}, - booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, - pages={4643--4652}, - year={2022} -} -``` - diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 053a8aac..68a84a53 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -28,13 +28,12 @@ import time import logging from copy import deepcopy -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger from tools.infer.predict_system import TextSystem from ppstructure.layout.predict_layout import LayoutPredictor from ppstructure.table.predict_table import TableSystem, to_excel from ppstructure.utility import parse_args, draw_structure_result -from ppstructure.recovery.recovery_to_doc import convert_info_docx logger = get_logger() @@ -78,7 +77,7 @@ class StructureSystem(object): elif self.mode == 'vqa': raise NotImplementedError - def __call__(self, img, return_ocr_result_in_table=False): + def __call__(self, img, img_idx=0, return_ocr_result_in_table=False): time_dict = { 'image_orientation': 0, 'layout': 0, @@ -143,8 +142,8 @@ class StructureSystem(object): time_dict['det'] += ocr_time_dict['det'] time_dict['rec'] += ocr_time_dict['rec'] - 
# remove style char, - # when using the recognition model trained on the PubtabNet dataset, + # remove style char, + # when using the recognition model trained on the PubtabNet dataset, # it will recognize the text format in the table, such as style_token = [ '', '', '', '', '', @@ -169,7 +168,8 @@ class StructureSystem(object): 'type': region['label'].lower(), 'bbox': [x1, y1, x2, y2], 'img': roi_img, - 'res': res + 'res': res, + 'img_idx': img_idx }) end = time.time() time_dict['all'] = end - start @@ -179,26 +179,29 @@ class StructureSystem(object): return None, None -def save_structure_res(res, save_folder, img_name): +def save_structure_res(res, save_folder, img_name, img_idx=0): excel_save_folder = os.path.join(save_folder, img_name) os.makedirs(excel_save_folder, exist_ok=True) res_cp = deepcopy(res) # save res with open( - os.path.join(excel_save_folder, 'res.txt'), 'w', + os.path.join(excel_save_folder, 'res_{}.txt'.format(img_idx)), + 'w', encoding='utf8') as f: for region in res_cp: roi_img = region.pop('img') f.write('{}\n'.format(json.dumps(region))) - if region['type'] == 'table' and len(region[ + if region['type'].lower() == 'table' and len(region[ 'res']) > 0 and 'html' in region['res']: - excel_path = os.path.join(excel_save_folder, - '{}.xlsx'.format(region['bbox'])) + excel_path = os.path.join( + excel_save_folder, + '{}_{}.xlsx'.format(region['bbox'], img_idx)) to_excel(region['res']['html'], excel_path) - elif region['type'] == 'figure': - img_path = os.path.join(excel_save_folder, - '{}.jpg'.format(region['bbox'])) + elif region['type'].lower() == 'figure': + img_path = os.path.join( + excel_save_folder, + '{}_{}.jpg'.format(region['bbox'], img_idx)) cv2.imwrite(img_path, roi_img) @@ -214,28 +217,75 @@ def main(args): for i, image_file in enumerate(image_file_list): logger.info("[{}/{}] {}".format(i, img_num, image_file)) - img, flag = check_and_read_gif(image_file) + img, flag_gif, flag_pdf = check_and_read(image_file) img_name = 
os.path.basename(image_file).split('.')[0] - if not flag: + if not flag_gif and not flag_pdf: img = cv2.imread(image_file) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - continue - res, time_dict = structure_sys(img) - if structure_sys.mode == 'structure': - save_structure_res(res, save_folder, img_name) - draw_img = draw_structure_result(img, res, args.vis_font_path) - img_save_path = os.path.join(save_folder, img_name, 'show.jpg') - elif structure_sys.mode == 'vqa': - raise NotImplementedError - # draw_img = draw_ser_results(img, res, args.vis_font_path) - # img_save_path = os.path.join(save_folder, img_name + '.jpg') - cv2.imwrite(img_save_path, draw_img) - logger.info('result save to {}'.format(img_save_path)) - if args.recovery: - convert_info_docx(img, res, save_folder, img_name) + if not flag_pdf: + if img is None: + logger.error("error in loading image:{}".format(image_file)) + continue + res, time_dict = structure_sys(img) + + if structure_sys.mode == 'structure': + save_structure_res(res, save_folder, img_name) + draw_img = draw_structure_result(img, res, args.vis_font_path) + img_save_path = os.path.join(save_folder, img_name, 'show.jpg') + elif structure_sys.mode == 'vqa': + raise NotImplementedError + # draw_img = draw_ser_results(img, res, args.vis_font_path) + # img_save_path = os.path.join(save_folder, img_name + '.jpg') + cv2.imwrite(img_save_path, draw_img) + logger.info('result save to {}'.format(img_save_path)) + if args.recovery: + try: + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + convert_info_docx(img, res, save_folder, img_name, + args.save_pdf) + except Exception as ex: + logger.error( + "error in layout recovery image:{}, err msg: {}".format( + image_file, ex)) + continue + else: + pdf_imgs = img + all_res = [] + for index, img in enumerate(pdf_imgs): + + res, time_dict = structure_sys(img, index) + if 
structure_sys.mode == 'structure' and res != []: + save_structure_res(res, save_folder, img_name, index) + draw_img = draw_structure_result(img, res, + args.vis_font_path) + img_save_path = os.path.join(save_folder, img_name, + 'show_{}.jpg'.format(index)) + elif structure_sys.mode == 'vqa': + raise NotImplementedError + # draw_img = draw_ser_results(img, res, args.vis_font_path) + # img_save_path = os.path.join(save_folder, img_name + '.jpg') + if res != []: + cv2.imwrite(img_save_path, draw_img) + logger.info('result save to {}'.format(img_save_path)) + if args.recovery and res != []: + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + all_res += res + + if args.recovery and all_res != []: + try: + convert_info_docx(img, all_res, save_folder, img_name, + args.save_pdf) + except Exception as ex: + logger.error( + "error in layout recovery image:{}, err msg: {}".format( + image_file, ex)) + continue + logger.info("Predict time : {:.3f}s".format(time_dict['all'])) diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 883dbef3..713d0307 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -78,9 +78,27 @@ wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar # Download the ultra-lightweight English table inch model and unzip it wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# Download the layout model of publaynet dataset and unzip it +wget +https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar cd .. 
# run -python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --output=../output/ \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_max_len=488 \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --save_pdf=False ``` -After running, the docx of each picture will be saved in the directory specified by the output field \ No newline at end of file +After running, the docx of each picture will be saved in the directory specified by the output field + +Recovery table to Word code[table_process.py] reference:https://github.com/pqzx/html2docx.git \ No newline at end of file diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md index 5a05abff..14ca8836 100644 --- a/ppstructure/recovery/README_ch.md +++ b/ppstructure/recovery/README_ch.md @@ -35,21 +35,15 @@ python3 -m pip install --upgrade pip # GPU安装 -python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple +python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple # CPU安装 -python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple +python3 -m 
pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple ``` 更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 -* **(2)安装依赖** - -```bash -python3 -m pip install -r ppstructure/recovery/requirements.txt -``` - ### 2.2 安装PaddleOCR @@ -87,11 +81,28 @@ wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar # 下载英文轻量级PP-OCRv3模型的识别模型并解压 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar # 下载超轻量级英文表格英寸模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar +# 下载英文版面分析模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar cd .. 
+ # 执行预测 -python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --output=../output/ \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_max_len=488 \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --save_pdf=False ``` -运行完成后,每张图片的docx文档会保存到output字段指定的目录下 +运行完成后,每张图片的docx文档会保存到`output`字段指定的目录下 +表格恢复到Word代码[table_process.py]来自:https://github.com/pqzx/html2docx.git diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py index 5278217d..4401b1f2 100644 --- a/ppstructure/recovery/recovery_to_doc.py +++ b/ppstructure/recovery/recovery_to_doc.py @@ -22,21 +22,23 @@ from docx import shared from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.section import WD_SECTION from docx.oxml.ns import qn +from docx.enum.table import WD_TABLE_ALIGNMENT + +from table_process import HtmlToDocx from ppocr.utils.logging import get_logger logger = get_logger() -def convert_info_docx(img, res, save_folder, img_name): +def convert_info_docx(img, res, save_folder, img_name, save_pdf): doc = Document() doc.styles['Normal'].font.name = 'Times New Roman' 
doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体') doc.styles['Normal'].font.size = shared.Pt(6.5) - h, w, _ = img.shape - res = sorted_layout_boxes(res, w) flag = 1 for i, region in enumerate(res): + img_idx = region['img_idx'] if flag == 2 and region['layout'] == 'single': section = doc.add_section(WD_SECTION.CONTINUOUS) section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1') @@ -46,10 +48,10 @@ def convert_info_docx(img, res, save_folder, img_name): section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2') flag = 2 - if region['type'] == 'Figure': + if region['type'].lower() == 'figure': excel_save_folder = os.path.join(save_folder, img_name) img_path = os.path.join(excel_save_folder, - '{}.jpg'.format(region['bbox'])) + '{}_{}.jpg'.format(region['bbox'], img_idx)) paragraph_pic = doc.add_paragraph() paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER run = paragraph_pic.add_run("") @@ -57,40 +59,38 @@ def convert_info_docx(img, res, save_folder, img_name): run.add_picture(img_path, width=shared.Inches(5)) elif flag == 2: run.add_picture(img_path, width=shared.Inches(2)) - elif region['type'] == 'Title': + elif region['type'].lower() == 'title': doc.add_heading(region['res'][0]['text']) - elif region['type'] == 'Text': + elif region['type'].lower() == 'table': + paragraph = doc.add_paragraph() + new_parser = HtmlToDocx() + new_parser.table_style = 'TableGrid' + table = new_parser.handle_table(html=region['res']['html']) + new_table = deepcopy(table) + new_table.alignment = WD_TABLE_ALIGNMENT.CENTER + paragraph.add_run().element.addnext(new_table._tbl) + + else: paragraph = doc.add_paragraph() paragraph_format = paragraph.paragraph_format for i, line in enumerate(region['res']): if i == 0: paragraph_format.first_line_indent = shared.Inches(0.25) text_run = paragraph.add_run(line['text'] + ' ') - text_run.font.size = shared.Pt(9) - elif region['type'] == 'Table': - pypandoc.convert( - source=region['res']['html'], - format='html', - 
to='docx', - outputfile='tmp.docx') - tmp_doc = Document('tmp.docx') - paragraph = doc.add_paragraph() - - table = tmp_doc.tables[0] - new_table = deepcopy(table) - new_table.style = doc.styles['Table Grid'] - from docx.enum.table import WD_TABLE_ALIGNMENT - new_table.alignment = WD_TABLE_ALIGNMENT.CENTER - paragraph.add_run().element.addnext(new_table._tbl) - os.remove('tmp.docx') - else: - continue + text_run.font.size = shared.Pt(10) # save to docx docx_path = os.path.join(save_folder, '{}.docx'.format(img_name)) doc.save(docx_path) logger.info('docx save to {}'.format(docx_path)) + # save to pdf + if save_pdf: + pdf = os.path.join(save_folder, '{}.pdf'.format(img_name)) + from docx2pdf import convert + convert(docx_path, pdf_path) + logger.info('pdf save to {}'.format(pdf)) + def sorted_layout_boxes(res, w): """ diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 04187baa..5ba3099d 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,3 +1,5 @@ -opencv-contrib-python==4.4.0.46 pypandoc -python-docx \ No newline at end of file +python-docx +docx2pdf +fitz +PyMuPDF \ No newline at end of file diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py new file mode 100644 index 00000000..243aaf89 --- /dev/null +++ b/ppstructure/recovery/table_process.py @@ -0,0 +1,632 @@ + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from:https://github.com/pqzx/html2docx/blob/8f6695a778c68befb302e48ac0ed5201ddbd4524/htmldocx/h2d.py + +""" +import re, argparse +import io, os +import urllib.request +from urllib.parse import urlparse +from html.parser import HTMLParser + +import docx, docx.table +from docx import Document +from docx.shared import RGBColor, Pt, Inches +from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH +from docx.oxml import OxmlElement +from docx.oxml.ns import qn + +from bs4 import BeautifulSoup + +# values in inches +INDENT = 0.25 +LIST_INDENT = 0.5 +MAX_INDENT = 5.5 # To stop indents going off the page + +# Style to use with tables. By default no style is used. +DEFAULT_TABLE_STYLE = None + +# Style to use with paragraphs. By default no style is used. +DEFAULT_PARAGRAPH_STYLE = None + + +def get_filename_from_url(url): + return os.path.basename(urlparse(url).path) + +def is_url(url): + """ + Not to be used for actually validating a url, but in our use case we only + care if it's a url or a file path, and they're pretty distinguishable + """ + parts = urlparse(url) + return all([parts.scheme, parts.netloc, parts.path]) + +def fetch_image(url): + """ + Attempts to fetch an image from a url. + If successful returns a bytes object, else returns None + :return: + """ + try: + with urllib.request.urlopen(url) as response: + # security flaw? + return io.BytesIO(response.read()) + except urllib.error.URLError: + return None + +def remove_last_occurence(ls, x): + ls.pop(len(ls) - ls[::-1].index(x) - 1) + +def remove_whitespace(string, leading=False, trailing=False): + """Remove white space from a string. + Args: + string(str): The string to remove white space from. + leading(bool, optional): Remove leading new lines when True. + trailing(bool, optional): Remove trailing new lines when False. 
+ Returns: + str: The input string with new line characters removed and white space squashed. + Examples: + Single or multiple new line characters are replaced with space. + >>> remove_whitespace("abc\\ndef") + 'abc def' + >>> remove_whitespace("abc\\n\\n\\ndef") + 'abc def' + New line characters surrounded by white space are replaced with a single space. + >>> remove_whitespace("abc \\n \\n \\n def") + 'abc def' + >>> remove_whitespace("abc \\n \\n \\n def") + 'abc def' + Leading and trailing new lines are replaced with a single space. + >>> remove_whitespace("\\nabc") + ' abc' + >>> remove_whitespace(" \\n abc") + ' abc' + >>> remove_whitespace("abc\\n") + 'abc ' + >>> remove_whitespace("abc \\n ") + 'abc ' + Use ``leading=True`` to remove leading new line characters, including any surrounding + white space: + >>> remove_whitespace("\\nabc", leading=True) + 'abc' + >>> remove_whitespace(" \\n abc", leading=True) + 'abc' + Use ``trailing=True`` to remove trailing new line characters, including any surrounding + white space: + >>> remove_whitespace("abc \\n ", trailing=True) + 'abc' + """ + # Remove any leading new line characters along with any surrounding white space + if leading: + string = re.sub(r'^\s*\n+\s*', '', string) + + # Remove any trailing new line characters along with any surrounding white space + if trailing: + string = re.sub(r'\s*\n+\s*$', '', string) + + # Replace new line characters and absorb any surrounding space. + string = re.sub(r'\s*\n\s*', ' ', string) + # TODO need some way to get rid of extra spaces in e.g. 
text text + return re.sub(r'\s+', ' ', string) + +def delete_paragraph(paragraph): + # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 + p = paragraph._element + p.getparent().remove(p) + p._p = p._element = None + +font_styles = { + 'b': 'bold', + 'strong': 'bold', + 'em': 'italic', + 'i': 'italic', + 'u': 'underline', + 's': 'strike', + 'sup': 'superscript', + 'sub': 'subscript', + 'th': 'bold', +} + +font_names = { + 'code': 'Courier', + 'pre': 'Courier', +} + +styles = { + 'LIST_BULLET': 'List Bullet', + 'LIST_NUMBER': 'List Number', +} + +class HtmlToDocx(HTMLParser): + + def __init__(self): + super().__init__() + self.options = { + 'fix-html': True, + 'images': True, + 'tables': True, + 'styles': True, + } + self.table_row_selectors = [ + 'table > tr', + 'table > thead > tr', + 'table > tbody > tr', + 'table > tfoot > tr' + ] + self.table_style = DEFAULT_TABLE_STYLE + self.paragraph_style = DEFAULT_PARAGRAPH_STYLE + + def set_initial_attrs(self, document=None): + self.tags = { + 'span': [], + 'list': [], + } + if document: + self.doc = document + else: + self.doc = Document() + self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.document = self.doc + self.include_tables = True #TODO add this option back in? 
+ self.include_images = self.options['images'] + self.include_styles = self.options['styles'] + self.paragraph = None + self.skip = False + self.skip_tag = None + self.instances_to_skip = 0 + + def copy_settings_from(self, other): + """Copy settings from another instance of HtmlToDocx""" + self.table_style = other.table_style + self.paragraph_style = other.paragraph_style + + def get_cell_html(self, soup): + # Returns string of td element with opening and closing tags removed + # Cannot use find_all as it only finds element tags and does not find text which + # is not inside an element + return ' '.join([str(i) for i in soup.contents]) + + def add_styles_to_paragraph(self, style): + if 'text-align' in style: + align = style['text-align'] + if align == 'center': + self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif align == 'right': + self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT + elif align == 'justify': + self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + if 'margin-left' in style: + margin = style['margin-left'] + units = re.sub(r'[0-9]+', '', margin) + margin = int(float(re.sub(r'[a-z]+', '', margin))) + if units == 'px': + self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) + # TODO handle non px units + + def add_styles_to_run(self, style): + if 'color' in style: + if 'rgb' in style['color']: + color = re.sub(r'[a-z()]+', '', style['color']) + colors = [int(x) for x in color.split(',')] + elif '#' in style['color']: + color = style['color'].lstrip('#') + colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + else: + colors = [0, 0, 0] + # TODO map colors to named colors (and extended colors...) 
+ # For now set color to black to prevent crashing + self.run.font.color.rgb = RGBColor(*colors) + + if 'background-color' in style: + if 'rgb' in style['background-color']: + color = color = re.sub(r'[a-z()]+', '', style['background-color']) + colors = [int(x) for x in color.split(',')] + elif '#' in style['background-color']: + color = style['background-color'].lstrip('#') + colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + else: + colors = [0, 0, 0] + # TODO map colors to named colors (and extended colors...) + # For now set color to black to prevent crashing + self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors + + def apply_paragraph_style(self, style=None): + try: + if style: + self.paragraph.style = style + elif self.paragraph_style: + self.paragraph.style = self.paragraph_style + except KeyError as e: + raise ValueError(f"Unable to apply style {self.paragraph_style}.") from e + + def parse_dict_string(self, string, separator=';'): + new_string = string.replace(" ", '').split(separator) + string_dict = dict([x.split(':') for x in new_string if ':' in x]) + return string_dict + + def handle_li(self): + # check list stack to determine style and depth + list_depth = len(self.tags['list']) + if list_depth: + list_type = self.tags['list'][-1] + else: + list_type = 'ul' # assign unordered if no tag + + if list_type == 'ol': + list_style = styles['LIST_NUMBER'] + else: + list_style = styles['LIST_BULLET'] + + self.paragraph = self.doc.add_paragraph(style=list_style) + self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT)) + self.paragraph.paragraph_format.line_spacing = 1 + + def add_image_to_cell(self, cell, image): + # python-docx doesn't have method yet for adding images to table cells. 
For now we use this + paragraph = cell.add_paragraph() + run = paragraph.add_run() + run.add_picture(image) + + def handle_img(self, current_attrs): + if not self.include_images: + self.skip = True + self.skip_tag = 'img' + return + src = current_attrs['src'] + # fetch image + src_is_url = is_url(src) + if src_is_url: + try: + image = fetch_image(src) + except urllib.error.URLError: + image = None + else: + image = src + # add image to doc + if image: + try: + if isinstance(self.doc, docx.document.Document): + self.doc.add_picture(image) + else: + self.add_image_to_cell(self.doc, image) + except FileNotFoundError: + image = None + if not image: + if src_is_url: + self.doc.add_paragraph("" % src) + else: + # avoid exposing filepaths in document + self.doc.add_paragraph("" % get_filename_from_url(src)) + + + def handle_table(self, html): + """ + To handle nested tables, we will parse tables manually as follows: + Get table soup + Create docx table + Iterate over soup and fill docx table with new instances of this parser + Tell HTMLParser to ignore any tags until the corresponding closing table tag + """ + doc = Document() + table_soup = BeautifulSoup(html, 'html.parser') + rows, cols_len = self.get_table_dimensions(table_soup) + table = doc.add_table(len(rows), cols_len) + table.style = doc.styles['Table Grid'] + cell_row = 0 + for index, row in enumerate(rows): + cols = self.get_table_columns(row) + cell_col = 0 + for col in cols: + colspan = int(col.attrs.get('colspan', 1)) + rowspan = int(col.attrs.get('rowspan', 1)) + + cell_html = self.get_cell_html(col) + + if col.name == 'th': + cell_html = "%s" % cell_html + docx_cell = table.cell(cell_row, cell_col) + while docx_cell.text != '': # Skip the merged cell + cell_col += 1 + docx_cell = table.cell(cell_row, cell_col) + + cell_to_merge = table.cell(cell_row + rowspan - 1, cell_col + colspan - 1) + if docx_cell != cell_to_merge: + docx_cell.merge(cell_to_merge) + + child_parser = HtmlToDocx() + 
child_parser.copy_settings_from(self) + + child_parser.add_html_to_cell(cell_html or ' ', docx_cell) # occupy the position + + cell_col += colspan + cell_row += 1 + + # skip all tags until corresponding closing tag + self.instances_to_skip = len(table_soup.find_all('table')) + self.skip_tag = 'table' + self.skip = True + self.table = None + return table + + def handle_link(self, href, text): + # Link requires a relationship + is_external = href.startswith('http') + rel_id = self.paragraph.part.relate_to( + href, + docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, + is_external=True # don't support anchor links for this library yet + ) + + # Create the w:hyperlink tag and add needed values + hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') + hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) + + + # Create sub-run + subrun = self.paragraph.add_run() + rPr = docx.oxml.shared.OxmlElement('w:rPr') + + # add default color + c = docx.oxml.shared.OxmlElement('w:color') + c.set(docx.oxml.shared.qn('w:val'), "0000EE") + rPr.append(c) + + # add underline + u = docx.oxml.shared.OxmlElement('w:u') + u.set(docx.oxml.shared.qn('w:val'), 'single') + rPr.append(u) + + subrun._r.append(rPr) + subrun._r.text = text + + # Add subrun to hyperlink + hyperlink.append(subrun._r) + + # Add hyperlink to run + self.paragraph._p.append(hyperlink) + + def handle_starttag(self, tag, attrs): + if self.skip: + return + if tag == 'head': + self.skip = True + self.skip_tag = tag + self.instances_to_skip = 0 + return + elif tag == 'body': + return + + current_attrs = dict(attrs) + + if tag == 'span': + self.tags['span'].append(current_attrs) + return + elif tag == 'ol' or tag == 'ul': + self.tags['list'].append(tag) + return # don't apply styles for now + elif tag == 'br': + self.run.add_break() + return + + self.tags[tag] = current_attrs + if tag in ['p', 'pre']: + self.paragraph = self.doc.add_paragraph() + self.apply_paragraph_style() + + elif tag == 'li': + self.handle_li() + + elif tag == 
"hr": + + # This implementation was taken from: + # https://github.com/python-openxml/python-docx/issues/105#issuecomment-62806373 + + self.paragraph = self.doc.add_paragraph() + pPr = self.paragraph._p.get_or_add_pPr() + pBdr = OxmlElement('w:pBdr') + pPr.insert_element_before(pBdr, + 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', + 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', + 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', + 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', + 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', + 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', + 'w:pPrChange' + ) + bottom = OxmlElement('w:bottom') + bottom.set(qn('w:val'), 'single') + bottom.set(qn('w:sz'), '6') + bottom.set(qn('w:space'), '1') + bottom.set(qn('w:color'), 'auto') + pBdr.append(bottom) + + elif re.match('h[1-9]', tag): + if isinstance(self.doc, docx.document.Document): + h_size = int(tag[1]) + self.paragraph = self.doc.add_heading(level=min(h_size, 9)) + else: + self.paragraph = self.doc.add_paragraph() + + elif tag == 'img': + self.handle_img(current_attrs) + return + + elif tag == 'table': + self.handle_table() + return + + # set new run reference point in case of leading line breaks + if tag in ['p', 'li', 'pre']: + self.run = self.paragraph.add_run() + + # add style + if not self.include_styles: + return + if 'style' in current_attrs and self.paragraph: + style = self.parse_dict_string(current_attrs['style']) + self.add_styles_to_paragraph(style) + + def handle_endtag(self, tag): + if self.skip: + if not tag == self.skip_tag: + return + + if self.instances_to_skip > 0: + self.instances_to_skip -= 1 + return + + self.skip = False + self.skip_tag = None + self.paragraph = None + + if tag == 'span': + if self.tags['span']: + self.tags['span'].pop() + return + elif tag == 'ol' or tag == 'ul': + remove_last_occurence(self.tags['list'], tag) + return + 
elif tag == 'table': + self.table_no += 1 + self.table = None + self.doc = self.document + self.paragraph = None + + if tag in self.tags: + self.tags.pop(tag) + # maybe set relevant reference to None? + + def handle_data(self, data): + if self.skip: + return + + # Only remove white space if we're not in a pre block. + if 'pre' not in self.tags: + # remove leading and trailing whitespace in all instances + data = remove_whitespace(data, True, True) + + if not self.paragraph: + self.paragraph = self.doc.add_paragraph() + self.apply_paragraph_style() + + # There can only be one nested link in a valid html document + # You cannot have interactive content in an A tag, this includes links + # https://html.spec.whatwg.org/#interactive-content + link = self.tags.get('a') + if link: + self.handle_link(link['href'], data) + else: + # If there's a link, dont put the data directly in the run + self.run = self.paragraph.add_run(data) + spans = self.tags['span'] + for span in spans: + if 'style' in span: + style = self.parse_dict_string(span['style']) + self.add_styles_to_run(style) + + # add font style and name + for tag in self.tags: + if tag in font_styles: + font_style = font_styles[tag] + setattr(self.run.font, font_style, True) + + if tag in font_names: + font_name = font_names[tag] + self.run.font.name = font_name + + def ignore_nested_tables(self, tables_soup): + """ + Returns array containing only the highest level tables + Operates on the assumption that bs4 returns child elements immediately after + the parent element in `find_all`. 
If this changes in the future, this method will need to be updated + :return: + """ + new_tables = [] + nest = 0 + for table in tables_soup: + if nest: + nest -= 1 + continue + new_tables.append(table) + nest = len(table.find_all('table')) + return new_tables + + def get_table_rows(self, table_soup): + # If there's a header, body, footer or direct child tr tags, add row dimensions from there + return table_soup.select(', '.join(self.table_row_selectors), recursive=False) + + def get_table_columns(self, row): + # Get all columns for the specified row tag. + return row.find_all(['th', 'td'], recursive=False) if row else [] + + def get_table_dimensions(self, table_soup): + # Get rows for the table + rows = self.get_table_rows(table_soup) + # Table is either empty or has non-direct children between table and tr tags + # Thus the row dimensions and column dimensions are assumed to be 0 + + cols = self.get_table_columns(rows[0]) if rows else [] + # Add colspan calculation column number + col_count = 0 + for col in cols: + colspan = col.attrs.get('colspan', 1) + col_count += int(colspan) + + # return len(rows), col_count + return rows, col_count + + def get_tables(self): + if not hasattr(self, 'soup'): + self.include_tables = False + return + # find other way to do it, or require this dependency? 
+ self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.table_no = 0 + + def run_process(self, html): + if self.bs and BeautifulSoup: + self.soup = BeautifulSoup(html, 'html.parser') + html = str(self.soup) + if self.include_tables: + self.get_tables() + self.feed(html) + + def add_html_to_document(self, html, document): + if not isinstance(html, str): + raise ValueError('First argument needs to be a %s' % str) + elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell): + raise ValueError('Second argument needs to be a %s' % docx.document.Document) + self.set_initial_attrs(document) + self.run_process(html) + + def add_html_to_cell(self, html, cell): + self.set_initial_attrs(cell) + self.run_process(html) + + def parse_html_file(self, filename_html, filename_docx=None): + with open(filename_html, 'r') as infile: + html = infile.read() + self.set_initial_attrs() + self.run_process(html) + if not filename_docx: + path, filename = os.path.split(filename_html) + filename_docx = '%s/new_docx_file_%s' % (path, filename) + self.doc.save('%s.docx' % filename_docx) + + def parse_html_string(self, html): + self.set_initial_attrs() + self.run_process(html) + return self.doc \ No newline at end of file diff --git a/ppstructure/utility.py b/ppstructure/utility.py index 625185e6..2cf20eb5 100644 --- a/ppstructure/utility.py +++ b/ppstructure/utility.py @@ -89,6 +89,11 @@ def init_args(): type=bool, default=False, help='Whether to enable layout of recovery') + parser.add_argument( + "--save_pdf", + type=bool, + default=False, + help='Whether to save pdf file') return parser -- GitLab