未验证 提交 f68813eb 编写于 作者: U user1018 提交者: GitHub

optimize recovery (#8346)

* optimize recovery

* update
上级 70987157
@@ -229,7 +229,9 @@ def main(args):
 if args.recovery and args.use_pdf2docx_api and flag_pdf:
 from pdf2docx.converter import Converter
-docx_file = os.path.join(args.output, '{}.docx'.format(img_name))
+os.makedirs(args.output, exist_ok=True)
+docx_file = os.path.join(args.output,
+'{}_api.docx'.format(img_name))
 cv = Converter(image_file)
 cv.convert(docx_file)
 cv.close()
......
@@ -73,7 +73,7 @@ def convert_info_docx(img, res, save_folder, img_name):
 text_run.font.size = shared.Pt(10)
 # save to docx
-docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
+docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name))
 doc.save(docx_path)
 logger.info('docx save to {}'.format(docx_path))
......
python-docx python-docx
PyMuPDF==1.19.0
beautifulsoup4 beautifulsoup4
fonttools>=4.24.0 fonttools>=4.24.0
fire>=0.3.0 fire>=0.3.0
......
@@ -278,8 +278,6 @@ class HtmlToDocx(HTMLParser):
 cell_col += colspan
 cell_row += 1
-doc.save('1.docx')
 def handle_data(self, data):
 if self.skip:
 return
......
@@ -14,4 +14,4 @@ lxml
 premailer
 openpyxl
 attrdict
-PyMuPDF==1.19.0
\ No newline at end of file
+PyMuPDF<1.21.0
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册