From 0f70eaf28514913c3d4b2d2da45bc04453acdf6f Mon Sep 17 00:00:00 2001
From: whjdark <44253501+whjdark@users.noreply.github.com>
Date: Thu, 20 Oct 2022 12:38:21 +0800
Subject: [PATCH] pdf2word v0.2.2

pdf2word v0.2.2
---
 ppstructure/pdf2word/pdf2word.py | 139 ++++++++++++++++++++++---------
 1 file changed, 98 insertions(+), 41 deletions(-)

diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py
index 6b394094..a287fb24 100644
--- a/ppstructure/pdf2word/pdf2word.py
+++ b/ppstructure/pdf2word/pdf2word.py
@@ -7,8 +7,11 @@ import functools
 import cv2
 import platform
 import numpy as np
+import fitz
+from PIL import Image
+from pdf2docx.converter import Converter
 from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \
-                           QGridLayout, QMessageBox, QLabel, QFileDialog
+                           QGridLayout, QMessageBox, QLabel, QFileDialog, QCheckBox
 from qtpy.QtCore import Signal, QThread, QObject
 from qtpy.QtGui import QImage, QPixmap, QIcon
 
@@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../'))
 sys.path.append(file)
 sys.path.insert(0, root)
 
+
 from ppstructure.predict_system import StructureSystem, save_structure_res
 from ppstructure.utility import parse_args, draw_structure_result
 from ppocr.utils.network import download_with_progressbar
@@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in
 # from ScreenShotWidget import ScreenShotWidget
 
 __APPNAME__ = "pdf2word"
-__VERSION__ = "0.1.1"
+__VERSION__ = "0.2.2"
 
 URLs_EN = {
     # 下载超英文轻量级PP-OCRv3模型的检测模型并解压
@@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array:
 
 
 def readImage(image_file) -> list:
-    if os.path.basename(image_file)[-3:] in ['pdf']:
-        import fitz
-        from PIL import Image
+    if os.path.basename(image_file)[-3:] == 'pdf':
         imgs = []
         with fitz.open(image_file) as pdf:
             for pg in range(0, pdf.pageCount):
@@ -102,17 +104,22 @@ def readImage(image_file) -> list:
 
 class Worker(QThread):
     progressBarValue = Signal(int)
+    progressBarRange = Signal(int)
     endsignal = Signal()
+    exceptedsignal = Signal(str)  # emitted with the exception message when run() fails
     loopFlag = True
 
-    def __init__(self, predictors, save_pdf, vis_font_path):
+    def __init__(self, predictors, save_pdf, vis_font_path, use_pdf2docx_api):
         super(Worker, self).__init__()
         self.predictors = predictors
         self.save_pdf = save_pdf
         self.vis_font_path = vis_font_path
         self.lang = 'EN'
         self.imagePaths = []
+        self.use_pdf2docx_api = use_pdf2docx_api
         self.outputDir = None
+        self.totalPageCnt = 0
+        self.pageCnt = 0
         self.setStackSize(1024*1024)
 
     def setImagePath(self, imagePaths):
@@ -123,61 +130,91 @@ class Worker(QThread):
 
     def setOutputDir(self, outputDir):
         self.outputDir = outputDir
+    
+    def setPDFParser(self, enabled):
+        self.use_pdf2docx_api = enabled
+
+    def resetPageCnt(self):
+        self.pageCnt = 0
+
+    def resetTotalPageCnt(self):
+        self.totalPageCnt = 0
 
-    def predictAndSave(self, imgs, img_name):
+    def ppocrPrecitor(self, imgs, img_name):
         all_res = []
+        # update progress bar ranges
+        self.totalPageCnt += len(imgs)
+        self.progressBarRange.emit(self.totalPageCnt)
+        # processing pages
         for index, img in enumerate(imgs):
             res, time_dict = self.predictors[self.lang](img)
 
             # save output
             save_structure_res(res, self.outputDir, img_name)
-            draw_img = draw_structure_result(img, res, self.vis_font_path)
-            img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index))
-            if res != []:
-                cv2.imwrite(img_save_path, draw_img)
+            # draw_img = draw_structure_result(img, res, self.vis_font_path)
+            # img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index))
+            # if res != []:
+            #     cv2.imwrite(img_save_path, draw_img)
 
             # recovery
             h, w, _ = img.shape
             res = sorted_layout_boxes(res, w)
             all_res += res
+            self.pageCnt += 1
+            self.progressBarValue.emit(self.pageCnt)
 
-        try:
-            convert_info_docx(img, all_res, self.outputDir, img_name, self.save_pdf)
-        except Exception as ex:
-            print(self,
-                "error in layout recovery image:{}, err msg: {}".format(
-                img_name, ex))
-
+        if all_res != []:
+            try:
+                convert_info_docx(imgs, all_res, self.outputDir, img_name)
+            except Exception as ex:
+                print("error in layout recovery image:{}, err msg: {}".
+                    format(img_name, ex))
+        print("Predict time : {:.3f}s".format(time_dict['all']))
         print('result save to {}'.format(self.outputDir)) 
 
     def run(self):
+        self.resetPageCnt()
+        self.resetTotalPageCnt()
         try:
-            findex = 0
             os.makedirs(self.outputDir, exist_ok=True)
             for i, image_file in enumerate(self.imagePaths):
-                if self.loopFlag == True:
+                if not self.loopFlag:
+                    break
+                # using use_pdf2docx_api for PDF parsing
+                if self.use_pdf2docx_api \
+                    and os.path.basename(image_file)[-3:] == 'pdf':
+                    self.totalPageCnt += 1
+                    self.progressBarRange.emit(self.totalPageCnt)
+                    print('===============using use_pdf2docx_api===============')
+                    img_name = os.path.basename(image_file).split('.')[0]
+                    docx_file = os.path.join(
+                        self.outputDir, '{}.docx'.format(img_name))
+                    cv = Converter(image_file)
+                    cv.convert(docx_file)
+                    cv.close()
+                    print('docx save to {}'.format(docx_file))
+                    self.pageCnt += 1
+                    self.progressBarValue.emit(self.pageCnt)
+                else:
+                    # using PPOCR for PDF/Image parsing
                     imgs = readImage(image_file)
                     if len(imgs) == 0:
                         continue
                     img_name = os.path.basename(image_file).split('.')[0]
                     os.makedirs(os.path.join(self.outputDir, img_name), exist_ok=True)
-                    self.predictAndSave(imgs, img_name)
-                    findex += 1
-                    self.progressBarValue.emit(findex)
-                else:
-                    break
+                    self.ppocrPrecitor(imgs, img_name)
+                # file processed
             self.endsignal.emit()
-            self.exec()
+            # self.exec()
         except Exception as e:
-            print(e)
-            raise
+            self.exceptedsignal.emit(str(e))  # send the exception message to the UI
 
 
 class APP_Image2Doc(QWidget):
     def __init__(self):
         super().__init__()
-        self.setFixedHeight(90)
-        self.setFixedWidth(400)
+        self.setFixedHeight(100)
+        self.setFixedWidth(420)
 
         # settings
         self.imagePaths = []
@@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget):
         self.output_dir = None
         self.vis_font_path = os.path.join(root,
                 "doc", "fonts", "simfang.ttf")
+        self.use_pdf2docx_api = False
 
         # ProgressBar
         self.pb = QProgressBar()
@@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget):
         }
 
         # 设置工作进程
-        self._thread = Worker(predictors, self.save_pdf, self.vis_font_path)
-        self._thread.progressBarValue.connect(self.handleProgressBarSingal)
+        self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, self.use_pdf2docx_api)
+        self._thread.progressBarValue.connect(self.handleProgressBarUpdateSingal)
         self._thread.endsignal.connect(self.handleEndsignalSignal)
-        self._thread.finished.connect(QObject.deleteLater)
+        # self._thread.finished.connect(QObject.deleteLater)
+        self._thread.progressBarRange.connect(self.handleProgressBarRangeSingal)
+        self._thread.exceptedsignal.connect(self.handleThreadException)
         self.time_start = 0  # save start time
 
     def setupUi(self):
@@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget):
         self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png")))
         layout.addWidget(self.startCNButton, 0, 1, 1, 1)
         self.startCNButton.clicked.connect(
-            functools.partial(self.handleStartSignal, 'CN'))
+            functools.partial(self.handleStartSignal, 'CN', False))
 
         self.startENButton = QPushButton("英文转换")
         self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png")))
         layout.addWidget(self.startENButton, 0, 2, 1, 1)
         self.startENButton.clicked.connect(
-            functools.partial(self.handleStartSignal, 'EN'))
+            functools.partial(self.handleStartSignal, 'EN', False))
 
+        self.PDFParserButton = QPushButton('PDF解析', self)
+        layout.addWidget(self.PDFParserButton, 0, 3, 1, 1)
+        self.PDFParserButton.clicked.connect(
+            functools.partial(self.handleStartSignal, 'CN', True))
+        
         self.showResultButton = QPushButton("显示结果")
         self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png")))
-        layout.addWidget(self.showResultButton, 0, 3, 1, 1)
+        layout.addWidget(self.showResultButton, 0, 4, 1, 1)
         self.showResultButton.clicked.connect(self.handleShowResultSignal)
 
         # ProgressBar
-        layout.addWidget(self.pb, 2, 0, 1, 4)
+        layout.addWidget(self.pb, 2, 0, 1, 5)
         # time estimate label
         self.timeEstLabel = QLabel(
             ("Time Left: --"))
-        layout.addWidget(self.timeEstLabel, 3, 0, 1, 4)
+        layout.addWidget(self.timeEstLabel, 3, 0, 1, 5)
 
         self.setLayout(layout)
 
@@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget):
         if len(selectedFiles) > 0:
             self.imagePaths = selectedFiles
             self.screenShot = None # discard screenshot temp image
-            self.pb.setRange(0, len(self.imagePaths))
             self.pb.setValue(0)
 
     # def screenShotSlot(self):
@@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget):
     #         self.pb.setRange(0, 1)
     #         self.pb.setValue(0)
 
-    def handleStartSignal(self, lang):
+    def handleStartSignal(self, lang='EN', pdfParser=False):
         if self.screenShot: # for screenShot
             img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime())
             image = QImageToCvMat(self.screenShot)
@@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget):
             self._thread.setOutputDir(self.output_dir)
             self._thread.setImagePath(self.imagePaths)
             self._thread.setLang(lang)
+            self._thread.setPDFParser(pdfParser)
             # disenble buttons
             self.openFileButton.setEnabled(False)
             self.startCNButton.setEnabled(False)
             self.startENButton.setEnabled(False)
+            self.PDFParserButton.setEnabled(False)
             # 启动工作进程
             self._thread.start()
             self.time_start = time.time() # log start time
@@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget):
             QMessageBox.information(self, 
                 u'Information', "输出文件不存在")
 
-    def handleProgressBarSingal(self, i):
+    def handleProgressBarUpdateSingal(self, i):
         self.pb.setValue(i)
         # calculate time left of recognition
         lenbar = self.pb.maximum()
@@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget):
         time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0]  # Remove microseconds
         self.timeEstLabel.setText(f"Time Left: {time_left}")  # show time left
 
+    def handleProgressBarRangeSingal(self, max):
+        self.pb.setRange(0, max)
+
     def handleEndsignalSignal(self):
         # enble buttons
         self.openFileButton.setEnabled(True)
         self.startCNButton.setEnabled(True)
         self.startENButton.setEnabled(True)
+        self.PDFParserButton.setEnabled(True)
         QMessageBox.information(self, u'Information', "转换结束")
 
+    def handleCBChangeSignal(self):
+        self._thread.setPDFParser(self.checkBox.isChecked())  # NOTE(review): no self.checkBox is created in the visible hunks - confirm it exists before connecting this slot
+
+    def handleThreadException(self, message):  # slot for Worker.exceptedsignal; message is str(exception)
+        self._thread.quit()  # ask the worker's event loop (if one is running) to exit
+        QMessageBox.information(self, u'Information', message)  # args are (parent, title, text); title was missing
+
 
 def main():
     app = QApplication(sys.argv)
-- 
GitLab