未验证 提交 f91026dd 编写于 作者: E Evezerest 提交者: GitHub

Merge pull request #8005 from whjdark/patch-5

pdf2word v0.2.2
...@@ -7,8 +7,11 @@ import functools ...@@ -7,8 +7,11 @@ import functools
import cv2 import cv2
import platform import platform
import numpy as np import numpy as np
import fitz
from PIL import Image
from pdf2docx.converter import Converter
from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \ from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \
QGridLayout, QMessageBox, QLabel, QFileDialog QGridLayout, QMessageBox, QLabel, QFileDialog, QCheckBox
from qtpy.QtCore import Signal, QThread, QObject from qtpy.QtCore import Signal, QThread, QObject
from qtpy.QtGui import QImage, QPixmap, QIcon from qtpy.QtGui import QImage, QPixmap, QIcon
...@@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../')) ...@@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../'))
sys.path.append(file) sys.path.append(file)
sys.path.insert(0, root) sys.path.insert(0, root)
from ppstructure.predict_system import StructureSystem, save_structure_res from ppstructure.predict_system import StructureSystem, save_structure_res
from ppstructure.utility import parse_args, draw_structure_result from ppstructure.utility import parse_args, draw_structure_result
from ppocr.utils.network import download_with_progressbar from ppocr.utils.network import download_with_progressbar
...@@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in ...@@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in
# from ScreenShotWidget import ScreenShotWidget # from ScreenShotWidget import ScreenShotWidget
__APPNAME__ = "pdf2word" __APPNAME__ = "pdf2word"
__VERSION__ = "0.1.1" __VERSION__ = "0.2.2"
URLs_EN = { URLs_EN = {
# 下载超英文轻量级PP-OCRv3模型的检测模型并解压 # 下载超英文轻量级PP-OCRv3模型的检测模型并解压
...@@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array: ...@@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array:
def readImage(image_file) -> list: def readImage(image_file) -> list:
if os.path.basename(image_file)[-3:] in ['pdf']: if os.path.basename(image_file)[-3:] == 'pdf':
import fitz
from PIL import Image
imgs = [] imgs = []
with fitz.open(image_file) as pdf: with fitz.open(image_file) as pdf:
for pg in range(0, pdf.pageCount): for pg in range(0, pdf.pageCount):
...@@ -102,17 +104,22 @@ def readImage(image_file) -> list: ...@@ -102,17 +104,22 @@ def readImage(image_file) -> list:
class Worker(QThread): class Worker(QThread):
progressBarValue = Signal(int) progressBarValue = Signal(int)
progressBarRange = Signal(int)
endsignal = Signal() endsignal = Signal()
exceptedsignal = Signal(str) #发送一个异常信号
loopFlag = True loopFlag = True
def __init__(self, predictors, save_pdf, vis_font_path, use_pdf2docx_api):
    """Store the conversion settings and reset the per-run state.

    Args:
        predictors: mapping indexed by language code (see ``self.lang``)
            whose values are called with a page image.
        save_pdf: stored on the instance as ``self.save_pdf``.
        vis_font_path: stored on the instance as ``self.vis_font_path``.
        use_pdf2docx_api: when true, PDF inputs are converted through the
            pdf2docx ``Converter`` instead of the OCR pipeline.
    """
    super().__init__()

    # Settings supplied by the caller.
    self.predictors = predictors
    self.save_pdf = save_pdf
    self.vis_font_path = vis_font_path
    self.use_pdf2docx_api = use_pdf2docx_api

    # State filled in later through the setters.
    self.lang = 'EN'
    self.imagePaths = []
    self.outputDir = None

    # Progress counters reported through the progress-bar signals.
    self.totalPageCnt = 0
    self.pageCnt = 0

    self.setStackSize(1024 * 1024)
def setImagePath(self, imagePaths): def setImagePath(self, imagePaths):
...@@ -123,61 +130,91 @@ class Worker(QThread): ...@@ -123,61 +130,91 @@ class Worker(QThread):
def setOutputDir(self, outputDir):
    """Remember the directory that conversion results are written to."""
    self.outputDir = outputDir
def setPDFParser(self, enabled):
    """Select whether PDF inputs go through the pdf2docx converter.

    The value is stored unchanged in ``self.use_pdf2docx_api`` and is read
    by ``run`` for every input file.
    """
    self.use_pdf2docx_api = enabled
def resetPageCnt(self):
    """Reset the count of pages processed so far to zero."""
    self.pageCnt = 0
def resetTotalPageCnt(self):
    """Reset the running total used as the progress-bar maximum to zero."""
    self.totalPageCnt = 0
def ppocrPrecitor(self, imgs, img_name):
    """Run the structure predictor on every page and export one docx.

    Args:
        imgs: sequence of page images; each must expose ``.shape`` whose
            second entry is used as the width for layout sorting.
        img_name: base name passed to ``save_structure_res`` and
            ``convert_info_docx`` together with ``self.outputDir``.

    Progress is reported through ``progressBarRange`` (new maximum) and
    ``progressBarValue`` (pages finished so far). A failure inside
    ``convert_info_docx`` is printed and does not propagate.
    """
    all_res = []
    # Timing info of the most recently processed page; stays None when
    # `imgs` is empty so the summary print below cannot hit an unbound name.
    time_dict = None

    # Grow the progress-bar range by the number of pages in this input.
    self.totalPageCnt += len(imgs)
    self.progressBarRange.emit(self.totalPageCnt)

    for img in imgs:
        res, time_dict = self.predictors[self.lang](img)
        # Hand the per-page result to the structure saver.
        save_structure_res(res, self.outputDir, img_name)
        # Layout recovery: order the regions using the image width.
        page_width = img.shape[1]
        all_res += sorted_layout_boxes(res, page_width)
        self.pageCnt += 1
        self.progressBarValue.emit(self.pageCnt)

    if all_res:
        try:
            convert_info_docx(imgs, all_res, self.outputDir, img_name)
        except Exception as ex:
            # Best effort: report the failure and keep the worker running.
            print("error in layout recovery image:{}, err msg: {}".
                  format(img_name, ex))
    if time_dict is not None:
        print("Predict time : {:.3f}s".format(time_dict['all']))
    print('result save to {}'.format(self.outputDir))
def run(self):
    """Thread entry point: convert every queued input file.

    For each path in ``self.imagePaths`` (until ``self.loopFlag`` becomes
    false) the file is either converted directly with the pdf2docx
    ``Converter`` (when ``self.use_pdf2docx_api`` is set and the name ends
    in ``pdf``) or read with ``readImage`` and passed to ``ppocrPrecitor``.

    ``endsignal`` is emitted after the loop finishes. Any exception is
    forwarded as text through ``exceptedsignal``; in that case
    ``endsignal`` is not emitted.
    """
    self.resetPageCnt()
    self.resetTotalPageCnt()
    try:
        os.makedirs(self.outputDir, exist_ok=True)
        for image_file in self.imagePaths:
            if not self.loopFlag:
                break
            base_name = os.path.basename(image_file)
            if self.use_pdf2docx_api and base_name[-3:] == 'pdf':
                # Direct PDF parsing: the whole file counts as one
                # progress step.
                self.totalPageCnt += 1
                self.progressBarRange.emit(self.totalPageCnt)
                print('===============using use_pdf2docx_api===============')
                img_name = base_name.split('.')[0]
                docx_file = os.path.join(
                    self.outputDir, '{}.docx'.format(img_name))
                cv = Converter(image_file)
                try:
                    cv.convert(docx_file)
                finally:
                    # Close the converter even when the conversion fails
                    # so the source PDF is not left open.
                    cv.close()
                print('docx save to {}'.format(docx_file))
                self.pageCnt += 1
                self.progressBarValue.emit(self.pageCnt)
            else:
                # OCR pipeline for images and PDFs.
                imgs = readImage(image_file)
                if len(imgs) == 0:
                    continue
                img_name = base_name.split('.')[0]
                os.makedirs(
                    os.path.join(self.outputDir, img_name), exist_ok=True)
                self.ppocrPrecitor(imgs, img_name)
        self.endsignal.emit()
    except Exception as e:
        # Forward the failure to the UI instead of handling it here.
        self.exceptedsignal.emit(str(e))
class APP_Image2Doc(QWidget): class APP_Image2Doc(QWidget):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.setFixedHeight(90) self.setFixedHeight(100)
self.setFixedWidth(400) self.setFixedWidth(420)
# settings # settings
self.imagePaths = [] self.imagePaths = []
...@@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget): ...@@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget):
self.output_dir = None self.output_dir = None
self.vis_font_path = os.path.join(root, self.vis_font_path = os.path.join(root,
"doc", "fonts", "simfang.ttf") "doc", "fonts", "simfang.ttf")
self.use_pdf2docx_api = False
# ProgressBar # ProgressBar
self.pb = QProgressBar() self.pb = QProgressBar()
...@@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget): ...@@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget):
} }
# 设置工作进程 # 设置工作进程
self._thread = Worker(predictors, self.save_pdf, self.vis_font_path) self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, self.use_pdf2docx_api)
self._thread.progressBarValue.connect(self.handleProgressBarSingal) self._thread.progressBarValue.connect(self.handleProgressBarUpdateSingal)
self._thread.endsignal.connect(self.handleEndsignalSignal) self._thread.endsignal.connect(self.handleEndsignalSignal)
self._thread.finished.connect(QObject.deleteLater) # self._thread.finished.connect(QObject.deleteLater)
self._thread.progressBarRange.connect(self.handleProgressBarRangeSingal)
self._thread.exceptedsignal.connect(self.handleThreadException)
self.time_start = 0 # save start time self.time_start = 0 # save start time
def setupUi(self): def setupUi(self):
...@@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget): ...@@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget):
self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png"))) self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png")))
layout.addWidget(self.startCNButton, 0, 1, 1, 1) layout.addWidget(self.startCNButton, 0, 1, 1, 1)
self.startCNButton.clicked.connect( self.startCNButton.clicked.connect(
functools.partial(self.handleStartSignal, 'CN')) functools.partial(self.handleStartSignal, 'CN', False))
self.startENButton = QPushButton("英文转换") self.startENButton = QPushButton("英文转换")
self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png"))) self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png")))
layout.addWidget(self.startENButton, 0, 2, 1, 1) layout.addWidget(self.startENButton, 0, 2, 1, 1)
self.startENButton.clicked.connect( self.startENButton.clicked.connect(
functools.partial(self.handleStartSignal, 'EN')) functools.partial(self.handleStartSignal, 'EN', False))
self.PDFParserButton = QPushButton('PDF解析', self)
layout.addWidget(self.PDFParserButton, 0, 3, 1, 1)
self.PDFParserButton.clicked.connect(
functools.partial(self.handleStartSignal, 'CN', True))
self.showResultButton = QPushButton("显示结果") self.showResultButton = QPushButton("显示结果")
self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png"))) self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png")))
layout.addWidget(self.showResultButton, 0, 3, 1, 1) layout.addWidget(self.showResultButton, 0, 4, 1, 1)
self.showResultButton.clicked.connect(self.handleShowResultSignal) self.showResultButton.clicked.connect(self.handleShowResultSignal)
# ProgressBar # ProgressBar
layout.addWidget(self.pb, 2, 0, 1, 4) layout.addWidget(self.pb, 2, 0, 1, 5)
# time estimate label # time estimate label
self.timeEstLabel = QLabel( self.timeEstLabel = QLabel(
("Time Left: --")) ("Time Left: --"))
layout.addWidget(self.timeEstLabel, 3, 0, 1, 4) layout.addWidget(self.timeEstLabel, 3, 0, 1, 5)
self.setLayout(layout) self.setLayout(layout)
...@@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget): ...@@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget):
if len(selectedFiles) > 0: if len(selectedFiles) > 0:
self.imagePaths = selectedFiles self.imagePaths = selectedFiles
self.screenShot = None # discard screenshot temp image self.screenShot = None # discard screenshot temp image
self.pb.setRange(0, len(self.imagePaths))
self.pb.setValue(0) self.pb.setValue(0)
# def screenShotSlot(self): # def screenShotSlot(self):
...@@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget): ...@@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget):
# self.pb.setRange(0, 1) # self.pb.setRange(0, 1)
# self.pb.setValue(0) # self.pb.setValue(0)
def handleStartSignal(self, lang): def handleStartSignal(self, lang='EN', pdfParser=False):
if self.screenShot: # for screenShot if self.screenShot: # for screenShot
img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime()) img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime())
image = QImageToCvMat(self.screenShot) image = QImageToCvMat(self.screenShot)
...@@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget): ...@@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget):
self._thread.setOutputDir(self.output_dir) self._thread.setOutputDir(self.output_dir)
self._thread.setImagePath(self.imagePaths) self._thread.setImagePath(self.imagePaths)
self._thread.setLang(lang) self._thread.setLang(lang)
self._thread.setPDFParser(pdfParser)
# disenble buttons # disenble buttons
self.openFileButton.setEnabled(False) self.openFileButton.setEnabled(False)
self.startCNButton.setEnabled(False) self.startCNButton.setEnabled(False)
self.startENButton.setEnabled(False) self.startENButton.setEnabled(False)
self.PDFParserButton.setEnabled(False)
# 启动工作进程 # 启动工作进程
self._thread.start() self._thread.start()
self.time_start = time.time() # log start time self.time_start = time.time() # log start time
...@@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget): ...@@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget):
QMessageBox.information(self, QMessageBox.information(self,
u'Information', "输出文件不存在") u'Information', "输出文件不存在")
def handleProgressBarSingal(self, i): def handleProgressBarUpdateSingal(self, i):
self.pb.setValue(i) self.pb.setValue(i)
# calculate time left of recognition # calculate time left of recognition
lenbar = self.pb.maximum() lenbar = self.pb.maximum()
...@@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget): ...@@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget):
time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds
self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left
def handleProgressBarRangeSingal(self, max):
    """Slot for the worker's range signal: set the bar to span 0..max."""
    progress_bar = self.pb
    progress_bar.setRange(0, max)
def handleEndsignalSignal(self):
    """Slot for the worker's end signal: unlock the UI and notify the user."""
    # Re-enable every button that starts or prepares a conversion.
    finished_buttons = (
        self.openFileButton,
        self.startCNButton,
        self.startENButton,
        self.PDFParserButton,
    )
    for button in finished_buttons:
        button.setEnabled(True)
    QMessageBox.information(self, u'Information', "转换结束")
def handleCBChangeSignal(self):
    """Push the check box state to the worker's PDF-parser switch."""
    # NOTE(review): `self.checkBox` is not created in the code visible
    # here - confirm it exists before connecting this slot.
    parser_enabled = self.checkBox.isChecked()
    self._thread.setPDFParser(parser_enabled)
def handleThreadException(self, message):
    """Slot for the worker's exception signal.

    Args:
        message: text of the exception raised inside the worker thread.

    Asks the worker thread to quit, re-enables the buttons that were
    disabled when the conversion started (the end signal is not emitted on
    the failure path, so they would otherwise stay locked) and shows the
    error text to the user.
    """
    self._thread.quit()
    for button in (self.openFileButton, self.startCNButton,
                   self.startENButton, self.PDFParserButton):
        button.setEnabled(True)
    # QMessageBox.information takes (parent, title, text); the message
    # must be passed as the text, not as the title.
    QMessageBox.information(self, u'Information', message)
def main(): def main():
app = QApplication(sys.argv) app = QApplication(sys.argv)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册