From 0f70eaf28514913c3d4b2d2da45bc04453acdf6f Mon Sep 17 00:00:00 2001 From: whjdark <44253501+whjdark@users.noreply.github.com> Date: Thu, 20 Oct 2022 12:38:21 +0800 Subject: [PATCH] pdf2word v0.2.2 pdf2word v0.2.2 --- ppstructure/pdf2word/pdf2word.py | 139 ++++++++++++++++++++++--------- 1 file changed, 98 insertions(+), 41 deletions(-) diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py index 6b394094..a287fb24 100644 --- a/ppstructure/pdf2word/pdf2word.py +++ b/ppstructure/pdf2word/pdf2word.py @@ -7,8 +7,11 @@ import functools import cv2 import platform import numpy as np +import fitz +from PIL import Image +from pdf2docx.converter import Converter from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \ - QGridLayout, QMessageBox, QLabel, QFileDialog + QGridLayout, QMessageBox, QLabel, QFileDialog, QCheckBox from qtpy.QtCore import Signal, QThread, QObject from qtpy.QtGui import QImage, QPixmap, QIcon @@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../')) sys.path.append(file) sys.path.insert(0, root) + from ppstructure.predict_system import StructureSystem, save_structure_res from ppstructure.utility import parse_args, draw_structure_result from ppocr.utils.network import download_with_progressbar @@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in # from ScreenShotWidget import ScreenShotWidget __APPNAME__ = "pdf2word" -__VERSION__ = "0.1.1" +__VERSION__ = "0.2.2" URLs_EN = { # 下载超英文轻量级PP-OCRv3模型的检测模型并解压 @@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array: def readImage(image_file) -> list: - if os.path.basename(image_file)[-3:] in ['pdf']: - import fitz - from PIL import Image + if os.path.basename(image_file)[-3:] == 'pdf': imgs = [] with fitz.open(image_file) as pdf: for pg in range(0, pdf.pageCount): @@ -102,17 +104,22 @@ def readImage(image_file) -> list: class Worker(QThread): progressBarValue = Signal(int) + progressBarRange = Signal(int) endsignal = Signal() + exceptedsignal = Signal(str) #发送一个异常信号 loopFlag = True - def __init__(self, predictors, save_pdf, vis_font_path): + def __init__(self, predictors, save_pdf, vis_font_path, use_pdf2docx_api): super(Worker, self).__init__() self.predictors = predictors self.save_pdf = save_pdf self.vis_font_path = vis_font_path self.lang = 'EN' self.imagePaths = [] + self.use_pdf2docx_api = use_pdf2docx_api self.outputDir = None + self.totalPageCnt = 0 + self.pageCnt = 0 self.setStackSize(1024*1024) def setImagePath(self, imagePaths): @@ -123,61 +130,91 @@ class Worker(QThread): def setOutputDir(self, outputDir): self.outputDir = outputDir + + def setPDFParser(self, enabled): + self.use_pdf2docx_api = enabled + + def resetPageCnt(self): + self.pageCnt = 0 + + def resetTotalPageCnt(self): + self.totalPageCnt = 0 - def predictAndSave(self, imgs, img_name): + def ppocrPrecitor(self, imgs, img_name): all_res = [] + # update progress bar ranges + self.totalPageCnt += len(imgs) + self.progressBarRange.emit(self.totalPageCnt) + # processing pages for index, img in enumerate(imgs): res, time_dict = self.predictors[self.lang](img) # save output save_structure_res(res, self.outputDir, img_name) - draw_img = draw_structure_result(img, res, self.vis_font_path) - img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index)) - if res != []: - cv2.imwrite(img_save_path, draw_img) + # draw_img = draw_structure_result(img, res, self.vis_font_path) + # img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index)) + # if res != []: + # cv2.imwrite(img_save_path, draw_img) # recovery h, w, _ = img.shape res = sorted_layout_boxes(res, w) all_res += res + self.pageCnt += 1 + self.progressBarValue.emit(self.pageCnt) - try: - convert_info_docx(img, all_res, self.outputDir, img_name, self.save_pdf) - except Exception as ex: - print(self, - "error in layout recovery image:{}, err msg: {}".format( - img_name, ex)) - + if all_res != []: + try: + convert_info_docx(imgs, all_res, self.outputDir, img_name) + except Exception as ex: + print("error in layout recovery image:{}, err msg: {}". + format(img_name, ex)) + print("Predict time : {:.3f}s".format(time_dict['all'])) print('result save to {}'.format(self.outputDir)) def run(self): + self.resetPageCnt() + self.resetTotalPageCnt() try: - findex = 0 os.makedirs(self.outputDir, exist_ok=True) for i, image_file in enumerate(self.imagePaths): - if self.loopFlag == True: + if not self.loopFlag: + break + # using use_pdf2docx_api for PDF parsing + if self.use_pdf2docx_api \ + and os.path.basename(image_file)[-3:] == 'pdf': + self.totalPageCnt += 1 + self.progressBarRange.emit(self.totalPageCnt) + print('===============using use_pdf2docx_api===============') + img_name = os.path.basename(image_file).split('.')[0] + docx_file = os.path.join( + self.outputDir, '{}.docx'.format(img_name)) + cv = Converter(image_file) + cv.convert(docx_file) + cv.close() + print('docx save to {}'.format(docx_file)) + self.pageCnt += 1 + self.progressBarValue.emit(self.pageCnt) + else: + # using PPOCR for PDF/Image parsing imgs = readImage(image_file) if len(imgs) == 0: continue img_name = os.path.basename(image_file).split('.')[0] os.makedirs(os.path.join(self.outputDir, img_name), exist_ok=True) - self.predictAndSave(imgs, img_name) - findex += 1 - self.progressBarValue.emit(findex) - else: - break + self.ppocrPrecitor(imgs, img_name) + # file processed self.endsignal.emit() - self.exec() + # self.exec() except Exception as e: - print(e) - raise + self.exceptedsignal.emit(str(e)) # 将异常发送给UI进程 class APP_Image2Doc(QWidget): def __init__(self): super().__init__() - self.setFixedHeight(90) - self.setFixedWidth(400) + self.setFixedHeight(100) + self.setFixedWidth(420) # settings self.imagePaths = [] @@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget): self.output_dir = None self.vis_font_path = os.path.join(root, "doc", "fonts", "simfang.ttf") + self.use_pdf2docx_api = False # ProgressBar self.pb = QProgressBar() @@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget): } # 设置工作进程 - self._thread = Worker(predictors, self.save_pdf, self.vis_font_path) - self._thread.progressBarValue.connect(self.handleProgressBarSingal) + self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, self.use_pdf2docx_api) + self._thread.progressBarValue.connect(self.handleProgressBarUpdateSingal) self._thread.endsignal.connect(self.handleEndsignalSignal) - self._thread.finished.connect(QObject.deleteLater) + # self._thread.finished.connect(QObject.deleteLater) + self._thread.progressBarRange.connect(self.handleProgressBarRangeSingal) + self._thread.exceptedsignal.connect(self.handleThreadException) self.time_start = 0 # save start time def setupUi(self): @@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget): self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png"))) layout.addWidget(self.startCNButton, 0, 1, 1, 1) self.startCNButton.clicked.connect( - functools.partial(self.handleStartSignal, 'CN')) + functools.partial(self.handleStartSignal, 'CN', False)) self.startENButton = QPushButton("英文转换") self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png"))) layout.addWidget(self.startENButton, 0, 2, 1, 1) self.startENButton.clicked.connect( - functools.partial(self.handleStartSignal, 'EN')) + functools.partial(self.handleStartSignal, 'EN', False)) + self.PDFParserButton = QPushButton('PDF解析', self) + layout.addWidget(self.PDFParserButton, 0, 3, 1, 1) + self.PDFParserButton.clicked.connect( + functools.partial(self.handleStartSignal, 'CN', True)) + self.showResultButton = QPushButton("显示结果") self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png"))) - layout.addWidget(self.showResultButton, 0, 3, 1, 1) + layout.addWidget(self.showResultButton, 0, 4, 1, 1) self.showResultButton.clicked.connect(self.handleShowResultSignal) # ProgressBar - layout.addWidget(self.pb, 2, 0, 1, 4) + layout.addWidget(self.pb, 2, 0, 1, 5) # time estimate label self.timeEstLabel = QLabel( ("Time Left: --")) - layout.addWidget(self.timeEstLabel, 3, 0, 1, 4) + layout.addWidget(self.timeEstLabel, 3, 0, 1, 5) self.setLayout(layout) @@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget): if len(selectedFiles) > 0: self.imagePaths = selectedFiles self.screenShot = None # discard screenshot temp image - self.pb.setRange(0, len(self.imagePaths)) self.pb.setValue(0) # def screenShotSlot(self): @@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget): # self.pb.setRange(0, 1) # self.pb.setValue(0) - def handleStartSignal(self, lang): + def handleStartSignal(self, lang='EN', pdfParser=False): if self.screenShot: # for screenShot img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime()) image = QImageToCvMat(self.screenShot) @@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget): self._thread.setOutputDir(self.output_dir) self._thread.setImagePath(self.imagePaths) self._thread.setLang(lang) + self._thread.setPDFParser(pdfParser) # disenble buttons self.openFileButton.setEnabled(False) self.startCNButton.setEnabled(False) self.startENButton.setEnabled(False) + self.PDFParserButton.setEnabled(False) # 启动工作进程 self._thread.start() self.time_start = time.time() # log start time @@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget): QMessageBox.information(self, u'Information', "输出文件不存在") - def handleProgressBarSingal(self, i): + def handleProgressBarUpdateSingal(self, i): self.pb.setValue(i) # calculate time left of recognition lenbar = self.pb.maximum() @@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget): time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left + def handleProgressBarRangeSingal(self, max): + self.pb.setRange(0, max) + def handleEndsignalSignal(self): # enble buttons self.openFileButton.setEnabled(True) self.startCNButton.setEnabled(True) self.startENButton.setEnabled(True) + self.PDFParserButton.setEnabled(True) QMessageBox.information(self, u'Information', "转换结束") + def handleCBChangeSignal(self): + self._thread.setPDFParser(self.checkBox.isChecked()) + + def handleThreadException(self, message): + self._thread.quit() + QMessageBox.information(self, message) + def main(): app = QApplication(sys.argv) -- GitLab