diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index bb061c998f6f8b16c06f9ee94299af0f59c53eb2..b32b706299bc188d824ac984173c97adc378f8de 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -229,7 +229,9 @@ def main(args): if args.recovery and args.use_pdf2docx_api and flag_pdf: from pdf2docx.converter import Converter - docx_file = os.path.join(args.output, '{}.docx'.format(img_name)) + os.makedirs(args.output, exist_ok=True) + docx_file = os.path.join(args.output, + '{}_api.docx'.format(img_name)) cv = Converter(image_file) cv.convert(docx_file) cv.close() diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py index 1d8f8d9d4babca7410d6625dbeac4c41668f58a7..05018120820d49216e292d1d4a726cba6458db66 100644 --- a/ppstructure/recovery/recovery_to_doc.py +++ b/ppstructure/recovery/recovery_to_doc.py @@ -73,7 +73,7 @@ def convert_info_docx(img, res, save_folder, img_name): text_run.font.size = shared.Pt(10) # save to docx - docx_path = os.path.join(save_folder, '{}.docx'.format(img_name)) + docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name)) doc.save(docx_path) logger.info('docx save to {}'.format(docx_path)) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index ec08f9d0a28b54e3e082db4d32799f8384250c1d..761b9d7c3e34cedb335e2c93707619593ebede63 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,5 +1,4 @@ python-docx -PyMuPDF==1.19.0 beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py index 982e6b760f9291628d0514728dc8f684f183aa2c..77a6ef7659666ebcbe54dd0c107cb2d62e4c7273 100644 --- a/ppstructure/recovery/table_process.py +++ b/ppstructure/recovery/table_process.py @@ -278,8 +278,6 @@ class HtmlToDocx(HTMLParser): cell_col += colspan cell_row += 1 - doc.save('1.docx') - def handle_data(self, data): if self.skip: return diff --git a/requirements.txt b/requirements.txt index f3d9ce89e3e2ae9079598d37f75b9e4e63d871a6..b4f8b011ee49e288fece2f15815f68050ec92944 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ lxml premailer openpyxl attrdict -PyMuPDF==1.19.0 \ No newline at end of file +PyMuPDF<1.21.0 \ No newline at end of file