', '', '',
@@ -169,7 +168,8 @@ class StructureSystem(object):
'type': region['label'].lower(),
'bbox': [x1, y1, x2, y2],
'img': roi_img,
- 'res': res
+ 'res': res,
+ 'img_idx': img_idx
})
end = time.time()
time_dict['all'] = end - start
@@ -179,26 +179,29 @@ class StructureSystem(object):
return None, None
-def save_structure_res(res, save_folder, img_name):
+def save_structure_res(res, save_folder, img_name, img_idx=0):
excel_save_folder = os.path.join(save_folder, img_name)
os.makedirs(excel_save_folder, exist_ok=True)
res_cp = deepcopy(res)
# save res
with open(
- os.path.join(excel_save_folder, 'res.txt'), 'w',
+ os.path.join(excel_save_folder, 'res_{}.txt'.format(img_idx)),
+ 'w',
encoding='utf8') as f:
for region in res_cp:
roi_img = region.pop('img')
f.write('{}\n'.format(json.dumps(region)))
- if region['type'] == 'table' and len(region[
+ if region['type'].lower() == 'table' and len(region[
'res']) > 0 and 'html' in region['res']:
- excel_path = os.path.join(excel_save_folder,
- '{}.xlsx'.format(region['bbox']))
+ excel_path = os.path.join(
+ excel_save_folder,
+ '{}_{}.xlsx'.format(region['bbox'], img_idx))
to_excel(region['res']['html'], excel_path)
- elif region['type'] == 'figure':
- img_path = os.path.join(excel_save_folder,
- '{}.jpg'.format(region['bbox']))
+ elif region['type'].lower() == 'figure':
+ img_path = os.path.join(
+ excel_save_folder,
+ '{}_{}.jpg'.format(region['bbox'], img_idx))
cv2.imwrite(img_path, roi_img)
@@ -214,28 +217,75 @@ def main(args):
for i, image_file in enumerate(image_file_list):
logger.info("[{}/{}] {}".format(i, img_num, image_file))
- img, flag = check_and_read_gif(image_file)
+ img, flag_gif, flag_pdf = check_and_read(image_file)
img_name = os.path.basename(image_file).split('.')[0]
- if not flag:
+ if not flag_gif and not flag_pdf:
img = cv2.imread(image_file)
- if img is None:
- logger.error("error in loading image:{}".format(image_file))
- continue
- res, time_dict = structure_sys(img)
- if structure_sys.mode == 'structure':
- save_structure_res(res, save_folder, img_name)
- draw_img = draw_structure_result(img, res, args.vis_font_path)
- img_save_path = os.path.join(save_folder, img_name, 'show.jpg')
- elif structure_sys.mode == 'vqa':
- raise NotImplementedError
- # draw_img = draw_ser_results(img, res, args.vis_font_path)
- # img_save_path = os.path.join(save_folder, img_name + '.jpg')
- cv2.imwrite(img_save_path, draw_img)
- logger.info('result save to {}'.format(img_save_path))
- if args.recovery:
- convert_info_docx(img, res, save_folder, img_name)
+ if not flag_pdf:
+ if img is None:
+ logger.error("error in loading image:{}".format(image_file))
+ continue
+ res, time_dict = structure_sys(img)
+
+ if structure_sys.mode == 'structure':
+ save_structure_res(res, save_folder, img_name)
+ draw_img = draw_structure_result(img, res, args.vis_font_path)
+ img_save_path = os.path.join(save_folder, img_name, 'show.jpg')
+ elif structure_sys.mode == 'vqa':
+ raise NotImplementedError
+ # draw_img = draw_ser_results(img, res, args.vis_font_path)
+ # img_save_path = os.path.join(save_folder, img_name + '.jpg')
+ cv2.imwrite(img_save_path, draw_img)
+ logger.info('result save to {}'.format(img_save_path))
+ if args.recovery:
+ try:
+ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
+ h, w, _ = img.shape
+ res = sorted_layout_boxes(res, w)
+ convert_info_docx(img, res, save_folder, img_name,
+ args.save_pdf)
+ except Exception as ex:
+ logger.error(
+ "error in layout recovery image:{}, err msg: {}".format(
+ image_file, ex))
+ continue
+ else:
+ pdf_imgs = img
+ all_res = []
+ for index, img in enumerate(pdf_imgs):
+
+ res, time_dict = structure_sys(img, index)
+ if structure_sys.mode == 'structure' and res != []:
+ save_structure_res(res, save_folder, img_name, index)
+ draw_img = draw_structure_result(img, res,
+ args.vis_font_path)
+ img_save_path = os.path.join(save_folder, img_name,
+ 'show_{}.jpg'.format(index))
+ elif structure_sys.mode == 'vqa':
+ raise NotImplementedError
+ # draw_img = draw_ser_results(img, res, args.vis_font_path)
+ # img_save_path = os.path.join(save_folder, img_name + '.jpg')
+ if res != []:
+ cv2.imwrite(img_save_path, draw_img)
+ logger.info('result save to {}'.format(img_save_path))
+ if args.recovery and res != []:
+ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
+ h, w, _ = img.shape
+ res = sorted_layout_boxes(res, w)
+ all_res += res
+
+ if args.recovery and all_res != []:
+ try:
+ convert_info_docx(img, all_res, save_folder, img_name,
+ args.save_pdf)
+ except Exception as ex:
+ logger.error(
+ "error in layout recovery image:{}, err msg: {}".format(
+ image_file, ex))
+ continue
+
logger.info("Predict time : {:.3f}s".format(time_dict['all']))
diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md
index 883dbef3e829dfa213644b610af1ca279dac8641..713d0307dbbd66664db15d19df484af76efea75a 100644
--- a/ppstructure/recovery/README.md
+++ b/ppstructure/recovery/README.md
@@ -78,9 +78,27 @@ wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
# Download the ultra-lightweight English table inch model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
+# Download the layout model of publaynet dataset and unzip it
+wget \
+https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar
cd ..
# run
-python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
+python3 predict_system.py \
+ --image_dir=./docs/table/1.png \
+ --det_model_dir=inference/en_PP-OCRv3_det_infer \
+ --rec_model_dir=inference/en_PP-OCRv3_rec_infer \
+ --rec_char_dict_path=../ppocr/utils/en_dict.txt \
+ --output=../output/ \
+ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
+ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+ --table_max_len=488 \
+ --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
+ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \
+ --vis_font_path=../doc/fonts/simfang.ttf \
+ --recovery=True \
+ --save_pdf=False
```
-After running, the docx of each picture will be saved in the directory specified by the output field
\ No newline at end of file
+After running, the docx of each picture will be saved in the directory specified by the output field
+
+The code that restores tables to Word (`table_process.py`) is adapted from https://github.com/pqzx/html2docx.git
\ No newline at end of file
diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md
index 5a05abffd0399387bc0d22d878e64d03d8894a79..14ca8836a0332a5b0e119be4bf6bcb36fb011d1e 100644
--- a/ppstructure/recovery/README_ch.md
+++ b/ppstructure/recovery/README_ch.md
@@ -35,21 +35,15 @@
python3 -m pip install --upgrade pip
# GPU安装
-python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple
+python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple
# CPU安装
-python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple
+python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple
```
更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
-* **(2)安装依赖**
-
-```bash
-python3 -m pip install -r ppstructure/recovery/requirements.txt
-```
-
### 2.2 安装PaddleOCR
@@ -87,11 +81,28 @@ wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
# 下载英文轻量级PP-OCRv3模型的识别模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
# 下载超轻量级英文表格英寸模型并解压
-wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
+# 下载英文版面分析模型
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar
cd ..
+
# 执行预测
-python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
+python3 predict_system.py \
+ --image_dir=./docs/table/1.png \
+ --det_model_dir=inference/en_PP-OCRv3_det_infer \
+ --rec_model_dir=inference/en_PP-OCRv3_rec_infer \
+ --rec_char_dict_path=../ppocr/utils/en_dict.txt \
+ --output=../output/ \
+ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
+ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+ --table_max_len=488 \
+ --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
+ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \
+ --vis_font_path=../doc/fonts/simfang.ttf \
+ --recovery=True \
+ --save_pdf=False
```
-运行完成后,每张图片的docx文档会保存到output字段指定的目录下
+运行完成后,每张图片的docx文档会保存到`output`字段指定的目录下
+表格恢复到Word代码[table_process.py]来自:https://github.com/pqzx/html2docx.git
diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py
index 5278217d5b983008d357b6b1be3ab1b883a4939d..4401b1f27cf10f8483ee9b2b4a61315ad6aad264 100644
--- a/ppstructure/recovery/recovery_to_doc.py
+++ b/ppstructure/recovery/recovery_to_doc.py
@@ -22,21 +22,23 @@ from docx import shared
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_SECTION
from docx.oxml.ns import qn
+from docx.enum.table import WD_TABLE_ALIGNMENT
+
+from table_process import HtmlToDocx
from ppocr.utils.logging import get_logger
logger = get_logger()
-def convert_info_docx(img, res, save_folder, img_name):
+def convert_info_docx(img, res, save_folder, img_name, save_pdf):
doc = Document()
doc.styles['Normal'].font.name = 'Times New Roman'
doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
doc.styles['Normal'].font.size = shared.Pt(6.5)
- h, w, _ = img.shape
- res = sorted_layout_boxes(res, w)
flag = 1
for i, region in enumerate(res):
+ img_idx = region['img_idx']
if flag == 2 and region['layout'] == 'single':
section = doc.add_section(WD_SECTION.CONTINUOUS)
section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
@@ -46,10 +48,10 @@ def convert_info_docx(img, res, save_folder, img_name):
section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
flag = 2
- if region['type'] == 'Figure':
+ if region['type'].lower() == 'figure':
excel_save_folder = os.path.join(save_folder, img_name)
img_path = os.path.join(excel_save_folder,
- '{}.jpg'.format(region['bbox']))
+ '{}_{}.jpg'.format(region['bbox'], img_idx))
paragraph_pic = doc.add_paragraph()
paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = paragraph_pic.add_run("")
@@ -57,40 +59,38 @@ def convert_info_docx(img, res, save_folder, img_name):
run.add_picture(img_path, width=shared.Inches(5))
elif flag == 2:
run.add_picture(img_path, width=shared.Inches(2))
- elif region['type'] == 'Title':
+ elif region['type'].lower() == 'title':
doc.add_heading(region['res'][0]['text'])
- elif region['type'] == 'Text':
+ elif region['type'].lower() == 'table':
+ paragraph = doc.add_paragraph()
+ new_parser = HtmlToDocx()
+ new_parser.table_style = 'TableGrid'
+ table = new_parser.handle_table(html=region['res']['html'])
+ new_table = deepcopy(table)
+ new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
+ paragraph.add_run().element.addnext(new_table._tbl)
+
+ else:
paragraph = doc.add_paragraph()
paragraph_format = paragraph.paragraph_format
for i, line in enumerate(region['res']):
if i == 0:
paragraph_format.first_line_indent = shared.Inches(0.25)
text_run = paragraph.add_run(line['text'] + ' ')
- text_run.font.size = shared.Pt(9)
- elif region['type'] == 'Table':
- pypandoc.convert(
- source=region['res']['html'],
- format='html',
- to='docx',
- outputfile='tmp.docx')
- tmp_doc = Document('tmp.docx')
- paragraph = doc.add_paragraph()
-
- table = tmp_doc.tables[0]
- new_table = deepcopy(table)
- new_table.style = doc.styles['Table Grid']
- from docx.enum.table import WD_TABLE_ALIGNMENT
- new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
- paragraph.add_run().element.addnext(new_table._tbl)
- os.remove('tmp.docx')
- else:
- continue
+ text_run.font.size = shared.Pt(10)
# save to docx
docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
doc.save(docx_path)
logger.info('docx save to {}'.format(docx_path))
+ # optionally convert the docx just saved into a pdf in the same folder (docx2pdf is imported only when needed)
+ if save_pdf:
+ pdf = os.path.join(save_folder, '{}.pdf'.format(img_name))
+ from docx2pdf import convert
+ convert(docx_path, pdf)
+ logger.info('pdf save to {}'.format(pdf))
+
def sorted_layout_boxes(res, w):
"""
diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt
index 04187baa2a72d2ac60f0a4e5ce643f882b7255fb..5ba3099d64574954c65ac8169798759dd7c053ac 100644
--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
@@ -1,3 +1,5 @@
-opencv-contrib-python==4.4.0.46
pypandoc
-python-docx
\ No newline at end of file
+python-docx
+docx2pdf
+fitz
+PyMuPDF
\ No newline at end of file
diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..243aaf8933791bf4704964d9665173fe70982f95
--- /dev/null
+++ b/ppstructure/recovery/table_process.py
@@ -0,0 +1,632 @@
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is refer from:https://github.com/pqzx/html2docx/blob/8f6695a778c68befb302e48ac0ed5201ddbd4524/htmldocx/h2d.py
+
+"""
+import re, argparse
+import io, os
+import urllib.request
+from urllib.parse import urlparse
+from html.parser import HTMLParser
+
+import docx, docx.table
+from docx import Document
+from docx.shared import RGBColor, Pt, Inches
+from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+
+from bs4 import BeautifulSoup
+
+# values in inches
+INDENT = 0.25
+LIST_INDENT = 0.5
+MAX_INDENT = 5.5 # To stop indents going off the page
+
+# Style to use with tables. By default no style is used.
+DEFAULT_TABLE_STYLE = None
+
+# Style to use with paragraphs. By default no style is used.
+DEFAULT_PARAGRAPH_STYLE = None
+
+
+def get_filename_from_url(url):
+ return os.path.basename(urlparse(url).path)
+
+def is_url(url):
+ """
+ Not to be used for actually validating a url, but in our use case we only
+ care if it's a url or a file path, and they're pretty distinguishable
+ """
+ parts = urlparse(url)
+ return all([parts.scheme, parts.netloc, parts.path])
+
+def fetch_image(url):
+ """
+ Attempts to fetch an image from a url.
+ If successful returns a bytes object, else returns None
+ :return:
+ """
+ try:
+ with urllib.request.urlopen(url) as response:
+ # security flaw?
+ return io.BytesIO(response.read())
+ except urllib.error.URLError:
+ return None
+
+def remove_last_occurence(ls, x):
+ ls.pop(len(ls) - ls[::-1].index(x) - 1)
+
+def remove_whitespace(string, leading=False, trailing=False):
+ """Remove white space from a string.
+ Args:
+ string(str): The string to remove white space from.
+ leading(bool, optional): Remove leading new lines when True.
+ trailing(bool, optional): Remove trailing new lines when True.
+ Returns:
+ str: The input string with new line characters removed and white space squashed.
+ Examples:
+ Single or multiple new line characters are replaced with space.
+ >>> remove_whitespace("abc\\ndef")
+ 'abc def'
+ >>> remove_whitespace("abc\\n\\n\\ndef")
+ 'abc def'
+ New line characters surrounded by white space are replaced with a single space.
+ >>> remove_whitespace("abc \\n \\n \\n def")
+ 'abc def'
+ >>> remove_whitespace("abc \\n \\n \\n def")
+ 'abc def'
+ Leading and trailing new lines are replaced with a single space.
+ >>> remove_whitespace("\\nabc")
+ ' abc'
+ >>> remove_whitespace(" \\n abc")
+ ' abc'
+ >>> remove_whitespace("abc\\n")
+ 'abc '
+ >>> remove_whitespace("abc \\n ")
+ 'abc '
+ Use ``leading=True`` to remove leading new line characters, including any surrounding
+ white space:
+ >>> remove_whitespace("\\nabc", leading=True)
+ 'abc'
+ >>> remove_whitespace(" \\n abc", leading=True)
+ 'abc'
+ Use ``trailing=True`` to remove trailing new line characters, including any surrounding
+ white space:
+ >>> remove_whitespace("abc \\n ", trailing=True)
+ 'abc'
+ """
+ # Remove any leading new line characters along with any surrounding white space
+ if leading:
+ string = re.sub(r'^\s*\n+\s*', '', string)
+
+ # Remove any trailing new line characters along with any surrounding white space
+ if trailing:
+ string = re.sub(r'\s*\n+\s*$', '', string)
+
+ # Replace new line characters and absorb any surrounding space.
+ string = re.sub(r'\s*\n\s*', ' ', string)
+ # TODO need some way to get rid of extra spaces in e.g. text text
+ return re.sub(r'\s+', ' ', string)
+
+def delete_paragraph(paragraph):
+ # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907
+ p = paragraph._element
+ p.getparent().remove(p)
+ p._p = p._element = None
+
+font_styles = {
+ 'b': 'bold',
+ 'strong': 'bold',
+ 'em': 'italic',
+ 'i': 'italic',
+ 'u': 'underline',
+ 's': 'strike',
+ 'sup': 'superscript',
+ 'sub': 'subscript',
+ 'th': 'bold',
+}
+
+font_names = {
+ 'code': 'Courier',
+ 'pre': 'Courier',
+}
+
+styles = {
+ 'LIST_BULLET': 'List Bullet',
+ 'LIST_NUMBER': 'List Number',
+}
+
+class HtmlToDocx(HTMLParser):
+
+ def __init__(self):
+ super().__init__()
+ self.options = {
+ 'fix-html': True,
+ 'images': True,
+ 'tables': True,
+ 'styles': True,
+ }
+ self.table_row_selectors = [
+ 'table > tr',
+ 'table > thead > tr',
+ 'table > tbody > tr',
+ 'table > tfoot > tr'
+ ]
+ self.table_style = DEFAULT_TABLE_STYLE
+ self.paragraph_style = DEFAULT_PARAGRAPH_STYLE
+
+ def set_initial_attrs(self, document=None):
+ self.tags = {
+ 'span': [],
+ 'list': [],
+ }
+ if document:
+ self.doc = document
+ else:
+ self.doc = Document()
+ self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
+ self.document = self.doc
+ self.include_tables = True #TODO add this option back in?
+ self.include_images = self.options['images']
+ self.include_styles = self.options['styles']
+ self.paragraph = None
+ self.skip = False
+ self.skip_tag = None
+ self.instances_to_skip = 0
+
+ def copy_settings_from(self, other):
+ """Copy settings from another instance of HtmlToDocx"""
+ self.table_style = other.table_style
+ self.paragraph_style = other.paragraph_style
+
+ def get_cell_html(self, soup):
+ # Returns string of td element with opening and closing tags removed
+ # Cannot use find_all as it only finds element tags and does not find text which
+ # is not inside an element
+ return ' '.join([str(i) for i in soup.contents])
+
+ def add_styles_to_paragraph(self, style):
+ if 'text-align' in style:
+ align = style['text-align']
+ if align == 'center':
+ self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
+ elif align == 'right':
+ self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
+ elif align == 'justify':
+ self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
+ if 'margin-left' in style:
+ margin = style['margin-left']
+ units = re.sub(r'[0-9]+', '', margin)
+ margin = int(float(re.sub(r'[a-z]+', '', margin)))
+ if units == 'px':
+ self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT))
+ # TODO handle non px units
+
+ def add_styles_to_run(self, style):
+ if 'color' in style:
+ if 'rgb' in style['color']:
+ color = re.sub(r'[a-z()]+', '', style['color'])
+ colors = [int(x) for x in color.split(',')]
+ elif '#' in style['color']:
+ color = style['color'].lstrip('#')
+ colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+ else:
+ colors = [0, 0, 0]
+ # TODO map colors to named colors (and extended colors...)
+ # For now set color to black to prevent crashing
+ self.run.font.color.rgb = RGBColor(*colors)
+
+ if 'background-color' in style:
+ if 'rgb' in style['background-color']:
+ color = color = re.sub(r'[a-z()]+', '', style['background-color'])
+ colors = [int(x) for x in color.split(',')]
+ elif '#' in style['background-color']:
+ color = style['background-color'].lstrip('#')
+ colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+ else:
+ colors = [0, 0, 0]
+ # TODO map colors to named colors (and extended colors...)
+ # For now set color to black to prevent crashing
+ self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors
+
+ def apply_paragraph_style(self, style=None):
+ try:
+ if style:
+ self.paragraph.style = style
+ elif self.paragraph_style:
+ self.paragraph.style = self.paragraph_style
+ except KeyError as e:
+ raise ValueError(f"Unable to apply style {self.paragraph_style}.") from e
+
+ def parse_dict_string(self, string, separator=';'):
+ new_string = string.replace(" ", '').split(separator)
+ string_dict = dict([x.split(':') for x in new_string if ':' in x])
+ return string_dict
+
+ def handle_li(self):
+ # check list stack to determine style and depth
+ list_depth = len(self.tags['list'])
+ if list_depth:
+ list_type = self.tags['list'][-1]
+ else:
+ list_type = 'ul' # assign unordered if no tag
+
+ if list_type == 'ol':
+ list_style = styles['LIST_NUMBER']
+ else:
+ list_style = styles['LIST_BULLET']
+
+ self.paragraph = self.doc.add_paragraph(style=list_style)
+ self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT))
+ self.paragraph.paragraph_format.line_spacing = 1
+
+ def add_image_to_cell(self, cell, image):
+ # python-docx doesn't have method yet for adding images to table cells. For now we use this
+ paragraph = cell.add_paragraph()
+ run = paragraph.add_run()
+ run.add_picture(image)
+
+ def handle_img(self, current_attrs): # add the image named by current_attrs['src'] to self.doc, or a text placeholder if it cannot be added
+ if not self.include_images:
+ self.skip = True
+ self.skip_tag = 'img'
+ return
+ src = current_attrs['src']
+ # a url src is downloaded into memory; anything else is used as given (treated as a file path)
+ src_is_url = is_url(src)
+ if src_is_url:
+ try:
+ image = fetch_image(src)
+ except urllib.error.URLError:
+ image = None
+ else:
+ image = src
+ # add image to doc (self.doc is either a Document or a table cell)
+ if image:
+ try:
+ if isinstance(self.doc, docx.document.Document):
+ self.doc.add_picture(image)
+ else:
+ self.add_image_to_cell(self.doc, image)
+ except FileNotFoundError:
+ image = None
+ if not image:
+ if src_is_url:
+ self.doc.add_paragraph("<image: %s>" % src)
+ else:
+ # avoid exposing filepaths in document
+ self.doc.add_paragraph("<image: %s>" % get_filename_from_url(src))
+
+
+ def handle_table(self, html):
+ """
+ Build a docx table from an HTML table string and return it.
+ Parse html with BeautifulSoup and size a table in a scratch Document
+ from the row list and the colspan sum of the first row. Fill each cell
+ through a child HtmlToDocx parser, merging cells for colspan/rowspan.
+ Finally flag this parser to skip tags up to the closing table tag.
+ """
+ doc = Document()
+ table_soup = BeautifulSoup(html, 'html.parser')
+ rows, cols_len = self.get_table_dimensions(table_soup)
+ table = doc.add_table(len(rows), cols_len)
+ table.style = doc.styles['Table Grid']
+ cell_row = 0
+ for index, row in enumerate(rows):
+ cols = self.get_table_columns(row)
+ cell_col = 0
+ for col in cols:
+ colspan = int(col.attrs.get('colspan', 1))
+ rowspan = int(col.attrs.get('rowspan', 1))
+
+ cell_html = self.get_cell_html(col)
+
+ if col.name == 'th':
+ cell_html = "<b>%s</b>" % cell_html # the child parser never sees the th tag, so wrap header cells in bold
+ docx_cell = table.cell(cell_row, cell_col)
+ while docx_cell.text != '': # Skip the merged cell
+ cell_col += 1
+ docx_cell = table.cell(cell_row, cell_col)
+
+ cell_to_merge = table.cell(cell_row + rowspan - 1, cell_col + colspan - 1)
+ if docx_cell != cell_to_merge:
+ docx_cell.merge(cell_to_merge)
+
+ child_parser = HtmlToDocx()
+ child_parser.copy_settings_from(self)
+
+ child_parser.add_html_to_cell(cell_html or ' ', docx_cell) # occupy the position
+
+ cell_col += colspan
+ cell_row += 1
+
+ # skip all tags until corresponding closing tag
+ self.instances_to_skip = len(table_soup.find_all('table'))
+ self.skip_tag = 'table'
+ self.skip = True
+ self.table = None
+ return table
+
+ def handle_link(self, href, text):
+ # Link requires a relationship
+ is_external = href.startswith('http')
+ rel_id = self.paragraph.part.relate_to(
+ href,
+ docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
+ is_external=True # don't support anchor links for this library yet
+ )
+
+ # Create the w:hyperlink tag and add needed values
+ hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
+ hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)
+
+
+ # Create sub-run
+ subrun = self.paragraph.add_run()
+ rPr = docx.oxml.shared.OxmlElement('w:rPr')
+
+ # add default color
+ c = docx.oxml.shared.OxmlElement('w:color')
+ c.set(docx.oxml.shared.qn('w:val'), "0000EE")
+ rPr.append(c)
+
+ # add underline
+ u = docx.oxml.shared.OxmlElement('w:u')
+ u.set(docx.oxml.shared.qn('w:val'), 'single')
+ rPr.append(u)
+
+ subrun._r.append(rPr)
+ subrun._r.text = text
+
+ # Add subrun to hyperlink
+ hyperlink.append(subrun._r)
+
+ # Add hyperlink to run
+ self.paragraph._p.append(hyperlink)
+
+ def handle_starttag(self, tag, attrs): # HTMLParser callback: dispatch on each opening tag
+ if self.skip:
+ return
+ if tag == 'head':
+ self.skip = True
+ self.skip_tag = tag
+ self.instances_to_skip = 0
+ return
+ elif tag == 'body':
+ return
+
+ current_attrs = dict(attrs)
+
+ if tag == 'span':
+ self.tags['span'].append(current_attrs)
+ return
+ elif tag == 'ol' or tag == 'ul':
+ self.tags['list'].append(tag)
+ return # don't apply styles for now
+ elif tag == 'br':
+ self.run.add_break()
+ return
+
+ self.tags[tag] = current_attrs
+ if tag in ['p', 'pre']:
+ self.paragraph = self.doc.add_paragraph()
+ self.apply_paragraph_style()
+
+ elif tag == 'li':
+ self.handle_li()
+
+ elif tag == "hr":
+
+ # This implementation was taken from:
+ # https://github.com/python-openxml/python-docx/issues/105#issuecomment-62806373
+
+ self.paragraph = self.doc.add_paragraph()
+ pPr = self.paragraph._p.get_or_add_pPr()
+ pBdr = OxmlElement('w:pBdr')
+ pPr.insert_element_before(pBdr,
+ 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
+ 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
+ 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
+ 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
+ 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
+ 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
+ 'w:pPrChange'
+ )
+ bottom = OxmlElement('w:bottom')
+ bottom.set(qn('w:val'), 'single')
+ bottom.set(qn('w:sz'), '6')
+ bottom.set(qn('w:space'), '1')
+ bottom.set(qn('w:color'), 'auto')
+ pBdr.append(bottom)
+
+ elif re.match('h[1-9]', tag):
+ if isinstance(self.doc, docx.document.Document):
+ h_size = int(tag[1])
+ self.paragraph = self.doc.add_heading(level=min(h_size, 9))
+ else:
+ self.paragraph = self.doc.add_paragraph()
+
+ elif tag == 'img':
+ self.handle_img(current_attrs)
+ return
+
+ elif tag == 'table':
+ self.handle_table() # NOTE(review): handle_table is declared as handle_table(self, html); this call omits html and would raise TypeError if reached
+ return
+
+ # set new run reference point in case of leading line breaks
+ if tag in ['p', 'li', 'pre']:
+ self.run = self.paragraph.add_run()
+
+ # add style
+ if not self.include_styles:
+ return
+ if 'style' in current_attrs and self.paragraph:
+ style = self.parse_dict_string(current_attrs['style'])
+ self.add_styles_to_paragraph(style)
+
+ def handle_endtag(self, tag):
+ if self.skip:
+ if not tag == self.skip_tag:
+ return
+
+ if self.instances_to_skip > 0:
+ self.instances_to_skip -= 1
+ return
+
+ self.skip = False
+ self.skip_tag = None
+ self.paragraph = None
+
+ if tag == 'span':
+ if self.tags['span']:
+ self.tags['span'].pop()
+ return
+ elif tag == 'ol' or tag == 'ul':
+ remove_last_occurence(self.tags['list'], tag)
+ return
+ elif tag == 'table':
+ self.table_no += 1
+ self.table = None
+ self.doc = self.document
+ self.paragraph = None
+
+ if tag in self.tags:
+ self.tags.pop(tag)
+ # maybe set relevant reference to None?
+
+ def handle_data(self, data):
+ if self.skip:
+ return
+
+ # Only remove white space if we're not in a pre block.
+ if 'pre' not in self.tags:
+ # remove leading and trailing whitespace in all instances
+ data = remove_whitespace(data, True, True)
+
+ if not self.paragraph:
+ self.paragraph = self.doc.add_paragraph()
+ self.apply_paragraph_style()
+
+ # There can only be one nested link in a valid html document
+ # You cannot have interactive content in an A tag, this includes links
+ # https://html.spec.whatwg.org/#interactive-content
+ link = self.tags.get('a')
+ if link:
+ self.handle_link(link['href'], data)
+ else:
+ # If there's a link, dont put the data directly in the run
+ self.run = self.paragraph.add_run(data)
+ spans = self.tags['span']
+ for span in spans:
+ if 'style' in span:
+ style = self.parse_dict_string(span['style'])
+ self.add_styles_to_run(style)
+
+ # add font style and name
+ for tag in self.tags:
+ if tag in font_styles:
+ font_style = font_styles[tag]
+ setattr(self.run.font, font_style, True)
+
+ if tag in font_names:
+ font_name = font_names[tag]
+ self.run.font.name = font_name
+
+ def ignore_nested_tables(self, tables_soup):
+ """
+ Returns array containing only the highest level tables
+ Operates on the assumption that bs4 returns child elements immediately after
+ the parent element in `find_all`. If this changes in the future, this method will need to be updated
+ :return:
+ """
+ new_tables = []
+ nest = 0
+ for table in tables_soup:
+ if nest:
+ nest -= 1
+ continue
+ new_tables.append(table)
+ nest = len(table.find_all('table'))
+ return new_tables
+
+ def get_table_rows(self, table_soup):
+ # If there's a header, body, footer or direct child tr tags, add row dimensions from there
+ return table_soup.select(', '.join(self.table_row_selectors), recursive=False)
+
+ def get_table_columns(self, row):
+ # Get all columns for the specified row tag.
+ return row.find_all(['th', 'td'], recursive=False) if row else []
+
+ def get_table_dimensions(self, table_soup):
+ # Get rows for the table
+ rows = self.get_table_rows(table_soup)
+ # Table is either empty or has non-direct children between table and tr tags
+ # Thus the row dimensions and column dimensions are assumed to be 0
+
+ cols = self.get_table_columns(rows[0]) if rows else []
+ # Add colspan calculation column number
+ col_count = 0
+ for col in cols:
+ colspan = col.attrs.get('colspan', 1)
+ col_count += int(colspan)
+
+ # return len(rows), col_count
+ return rows, col_count
+
+ def get_tables(self):
+ if not hasattr(self, 'soup'):
+ self.include_tables = False
+ return
+ # find other way to do it, or require this dependency?
+ self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
+ self.table_no = 0
+
+ def run_process(self, html):
+ if self.bs and BeautifulSoup:
+ self.soup = BeautifulSoup(html, 'html.parser')
+ html = str(self.soup)
+ if self.include_tables:
+ self.get_tables()
+ self.feed(html)
+
+ def add_html_to_document(self, html, document):
+ if not isinstance(html, str):
+ raise ValueError('First argument needs to be a %s' % str)
+ elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell):
+ raise ValueError('Second argument needs to be a %s' % docx.document.Document)
+ self.set_initial_attrs(document)
+ self.run_process(html)
+
+ def add_html_to_cell(self, html, cell):
+ self.set_initial_attrs(cell)
+ self.run_process(html)
+
+ def parse_html_file(self, filename_html, filename_docx=None):
+ with open(filename_html, 'r') as infile:
+ html = infile.read()
+ self.set_initial_attrs()
+ self.run_process(html)
+ if not filename_docx:
+ path, filename = os.path.split(filename_html)
+ filename_docx = '%s/new_docx_file_%s' % (path, filename)
+ self.doc.save('%s.docx' % filename_docx)
+
+ def parse_html_string(self, html):
+ self.set_initial_attrs()
+ self.run_process(html)
+ return self.doc
\ No newline at end of file
diff --git a/ppstructure/utility.py b/ppstructure/utility.py
index 625185e6f5b090641befc35b3b4980c331687cff..2cf20eb53f87a8f8fbe2bdb4c3ead77f40120370 100644
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -89,6 +89,11 @@ def init_args():
type=bool,
default=False,
help='Whether to enable layout of recovery')
+ parser.add_argument(
+ "--save_pdf",
+ type=bool,
+ default=False,
+ help='Whether to save pdf file')
return parser
|