From f68813eb2a1a3777be4ac3eb03a376d994746db9 Mon Sep 17 00:00:00 2001 From: user1018 <614803115@qq.com> Date: Thu, 17 Nov 2022 16:18:05 +0800 Subject: [PATCH] optimize recovery (#8346) * optimize recovery * update --- ppstructure/predict_system.py | 4 +++- ppstructure/recovery/recovery_to_doc.py | 2 +- ppstructure/recovery/requirements.txt | 1 - ppstructure/recovery/table_process.py | 2 -- requirements.txt | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index bb061c99..b32b7062 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -229,7 +229,9 @@ def main(args): if args.recovery and args.use_pdf2docx_api and flag_pdf: from pdf2docx.converter import Converter - docx_file = os.path.join(args.output, '{}.docx'.format(img_name)) + os.makedirs(args.output, exist_ok=True) + docx_file = os.path.join(args.output, + '{}_api.docx'.format(img_name)) cv = Converter(image_file) cv.convert(docx_file) cv.close() diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py index 1d8f8d9d..05018120 100644 --- a/ppstructure/recovery/recovery_to_doc.py +++ b/ppstructure/recovery/recovery_to_doc.py @@ -73,7 +73,7 @@ def convert_info_docx(img, res, save_folder, img_name): text_run.font.size = shared.Pt(10) # save to docx - docx_path = os.path.join(save_folder, '{}.docx'.format(img_name)) + docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name)) doc.save(docx_path) logger.info('docx save to {}'.format(docx_path)) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index ec08f9d0..761b9d7c 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,5 +1,4 @@ python-docx -PyMuPDF==1.19.0 beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py index 982e6b76..77a6ef76 100644 --- a/ppstructure/recovery/table_process.py +++ b/ppstructure/recovery/table_process.py @@ -278,8 +278,6 @@ class HtmlToDocx(HTMLParser): cell_col += colspan cell_row += 1 - doc.save('1.docx') - def handle_data(self, data): if self.skip: return diff --git a/requirements.txt b/requirements.txt index f3d9ce89..b4f8b011 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ lxml premailer openpyxl attrdict -PyMuPDF==1.19.0 \ No newline at end of file +PyMuPDF<1.21.0 \ No newline at end of file -- GitLab