optimize recovery (#8346)

* optimize recovery
* update

optimize recovery (#8346)
* optimize recovery
* update
f68813eb · user1018 · GitHub · 70987157 · f68813eb · f68813eb
5 changed files
--- a/ppstructure/predict_system.py
+++ b/ppstructure/predict_system.py
@@ -229,7 +229,9 @@ def main(args):

        if args.recovery and args.use_pdf2docx_api and flag_pdf:
            from pdf2docx.converter import Converter
-            docx_file = os.path.join(args.output, '{}.docx'.format(img_name))
+            os.makedirs(args.output, exist_ok=True)
+            docx_file = os.path.join(args.output,
+                                     '{}_api.docx'.format(img_name))
            cv = Converter(image_file)
            cv.convert(docx_file)
            cv.close()

--- a/ppstructure/recovery/recovery_to_doc.py
+++ b/ppstructure/recovery/recovery_to_doc.py
@@ -73,7 +73,7 @@ def convert_info_docx(img, res, save_folder, img_name):
                text_run.font.size = shared.Pt(10)

    # save to docx
-    docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
+    docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name))
    doc.save(docx_path)
    logger.info('docx save to {}'.format(docx_path))


--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
 python-docx
-PyMuPDF==1.19.0
 beautifulsoup4
 fonttools>=4.24.0
 fire>=0.3.0

--- a/ppstructure/recovery/table_process.py
+++ b/ppstructure/recovery/table_process.py
@@ -278,8 +278,6 @@ class HtmlToDocx(HTMLParser):
                cell_col += colspan
            cell_row += 1

-        doc.save('1.docx')
-
    def handle_data(self, data):
        if self.skip:
            return

--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,4 @@ lxml
 premailer
 openpyxl
 attrdict
-PyMuPDF==1.19.0
\ No newline at end of file
+PyMuPDF<1.21.0
\ No newline at end of file