Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleOCR
提交
f91026dd
P
PaddleOCR
项目概览
PaddlePaddle
/
PaddleOCR
1 年多 前同步成功
通知
1533
Star
32963
Fork
6643
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
108
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
108
Issue
108
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f91026dd
编写于
10月 20, 2022
作者:
E
Evezerest
提交者:
GitHub
10月 20, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #8005 from whjdark/patch-5
pdf2word v0.2.2
上级
3907c72a
0f70eaf2
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
98 addition
and
41 deletion
+98
-41
ppstructure/pdf2word/pdf2word.py
ppstructure/pdf2word/pdf2word.py
+98
-41
未找到文件。
ppstructure/pdf2word/pdf2word.py
浏览文件 @
f91026dd
...
...
@@ -7,8 +7,11 @@ import functools
import
cv2
import
platform
import
numpy
as
np
import
fitz
from
PIL
import
Image
from
pdf2docx.converter
import
Converter
from
qtpy.QtWidgets
import
QApplication
,
QWidget
,
QPushButton
,
QProgressBar
,
\
QGridLayout
,
QMessageBox
,
QLabel
,
QFileDialog
QGridLayout
,
QMessageBox
,
QLabel
,
QFileDialog
,
QCheckBox
from
qtpy.QtCore
import
Signal
,
QThread
,
QObject
from
qtpy.QtGui
import
QImage
,
QPixmap
,
QIcon
...
...
@@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../'))
sys
.
path
.
append
(
file
)
sys
.
path
.
insert
(
0
,
root
)
from
ppstructure.predict_system
import
StructureSystem
,
save_structure_res
from
ppstructure.utility
import
parse_args
,
draw_structure_result
from
ppocr.utils.network
import
download_with_progressbar
...
...
@@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in
# from ScreenShotWidget import ScreenShotWidget
__APPNAME__
=
"pdf2word"
__VERSION__
=
"0.
1.1
"
__VERSION__
=
"0.
2.2
"
URLs_EN
=
{
# 下载超英文轻量级PP-OCRv3模型的检测模型并解压
...
...
@@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array:
def
readImage
(
image_file
)
->
list
:
if
os
.
path
.
basename
(
image_file
)[
-
3
:]
in
[
'pdf'
]:
import
fitz
from
PIL
import
Image
if
os
.
path
.
basename
(
image_file
)[
-
3
:]
==
'pdf'
:
imgs
=
[]
with
fitz
.
open
(
image_file
)
as
pdf
:
for
pg
in
range
(
0
,
pdf
.
pageCount
):
...
...
@@ -102,17 +104,22 @@ def readImage(image_file) -> list:
class
Worker
(
QThread
):
progressBarValue
=
Signal
(
int
)
progressBarRange
=
Signal
(
int
)
endsignal
=
Signal
()
exceptedsignal
=
Signal
(
str
)
#发送一个异常信号
loopFlag
=
True
def
__init__
(
self
,
predictors
,
save_pdf
,
vis_font_path
):
def
__init__
(
self
,
predictors
,
save_pdf
,
vis_font_path
,
use_pdf2docx_api
):
super
(
Worker
,
self
).
__init__
()
self
.
predictors
=
predictors
self
.
save_pdf
=
save_pdf
self
.
vis_font_path
=
vis_font_path
self
.
lang
=
'EN'
self
.
imagePaths
=
[]
self
.
use_pdf2docx_api
=
use_pdf2docx_api
self
.
outputDir
=
None
self
.
totalPageCnt
=
0
self
.
pageCnt
=
0
self
.
setStackSize
(
1024
*
1024
)
def
setImagePath
(
self
,
imagePaths
):
...
...
@@ -123,61 +130,91 @@ class Worker(QThread):
def
setOutputDir
(
self
,
outputDir
):
self
.
outputDir
=
outputDir
def
setPDFParser
(
self
,
enabled
):
self
.
use_pdf2docx_api
=
enabled
def
resetPageCnt
(
self
):
self
.
pageCnt
=
0
def
resetTotalPageCnt
(
self
):
self
.
totalPageCnt
=
0
def
p
redictAndSave
(
self
,
imgs
,
img_name
):
def
p
pocrPrecitor
(
self
,
imgs
,
img_name
):
all_res
=
[]
# update progress bar ranges
self
.
totalPageCnt
+=
len
(
imgs
)
self
.
progressBarRange
.
emit
(
self
.
totalPageCnt
)
# processing pages
for
index
,
img
in
enumerate
(
imgs
):
res
,
time_dict
=
self
.
predictors
[
self
.
lang
](
img
)
# save output
save_structure_res
(
res
,
self
.
outputDir
,
img_name
)
draw_img
=
draw_structure_result
(
img
,
res
,
self
.
vis_font_path
)
img_save_path
=
os
.
path
.
join
(
self
.
outputDir
,
img_name
,
'show_{}.jpg'
.
format
(
index
))
if
res
!=
[]:
cv2
.
imwrite
(
img_save_path
,
draw_img
)
#
draw_img = draw_structure_result(img, res, self.vis_font_path)
#
img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index))
#
if res != []:
#
cv2.imwrite(img_save_path, draw_img)
# recovery
h
,
w
,
_
=
img
.
shape
res
=
sorted_layout_boxes
(
res
,
w
)
all_res
+=
res
self
.
pageCnt
+=
1
self
.
progressBarValue
.
emit
(
self
.
pageCnt
)
try
:
convert_info_docx
(
img
,
all_res
,
self
.
outputDir
,
img_name
,
self
.
save_pdf
)
except
Exception
as
ex
:
print
(
self
,
"error in layout recovery image:{}, err msg: {}"
.
format
(
img_name
,
ex
))
if
all_res
!=
[]
:
try
:
convert_info_docx
(
imgs
,
all_res
,
self
.
outputDir
,
img_name
)
except
Exception
as
ex
:
print
(
"error in layout recovery image:{}, err msg: {}"
.
format
(
img_name
,
ex
))
print
(
"Predict time : {:.3f}s"
.
format
(
time_dict
[
'all'
]))
print
(
'result save to {}'
.
format
(
self
.
outputDir
))
def
run
(
self
):
self
.
resetPageCnt
()
self
.
resetTotalPageCnt
()
try
:
findex
=
0
os
.
makedirs
(
self
.
outputDir
,
exist_ok
=
True
)
for
i
,
image_file
in
enumerate
(
self
.
imagePaths
):
if
self
.
loopFlag
==
True
:
if
not
self
.
loopFlag
:
break
# using use_pdf2docx_api for PDF parsing
if
self
.
use_pdf2docx_api
\
and
os
.
path
.
basename
(
image_file
)[
-
3
:]
==
'pdf'
:
self
.
totalPageCnt
+=
1
self
.
progressBarRange
.
emit
(
self
.
totalPageCnt
)
print
(
'===============using use_pdf2docx_api==============='
)
img_name
=
os
.
path
.
basename
(
image_file
).
split
(
'.'
)[
0
]
docx_file
=
os
.
path
.
join
(
self
.
outputDir
,
'{}.docx'
.
format
(
img_name
))
cv
=
Converter
(
image_file
)
cv
.
convert
(
docx_file
)
cv
.
close
()
print
(
'docx save to {}'
.
format
(
docx_file
))
self
.
pageCnt
+=
1
self
.
progressBarValue
.
emit
(
self
.
pageCnt
)
else
:
# using PPOCR for PDF/Image parsing
imgs
=
readImage
(
image_file
)
if
len
(
imgs
)
==
0
:
continue
img_name
=
os
.
path
.
basename
(
image_file
).
split
(
'.'
)[
0
]
os
.
makedirs
(
os
.
path
.
join
(
self
.
outputDir
,
img_name
),
exist_ok
=
True
)
self
.
predictAndSave
(
imgs
,
img_name
)
findex
+=
1
self
.
progressBarValue
.
emit
(
findex
)
else
:
break
self
.
ppocrPrecitor
(
imgs
,
img_name
)
# file processed
self
.
endsignal
.
emit
()
self
.
exec
()
#
self.exec()
except
Exception
as
e
:
print
(
e
)
raise
self
.
exceptedsignal
.
emit
(
str
(
e
))
# 将异常发送给UI进程
class
APP_Image2Doc
(
QWidget
):
def
__init__
(
self
):
super
().
__init__
()
self
.
setFixedHeight
(
9
0
)
self
.
setFixedWidth
(
4
0
0
)
self
.
setFixedHeight
(
10
0
)
self
.
setFixedWidth
(
4
2
0
)
# settings
self
.
imagePaths
=
[]
...
...
@@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget):
self
.
output_dir
=
None
self
.
vis_font_path
=
os
.
path
.
join
(
root
,
"doc"
,
"fonts"
,
"simfang.ttf"
)
self
.
use_pdf2docx_api
=
False
# ProgressBar
self
.
pb
=
QProgressBar
()
...
...
@@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget):
}
# 设置工作进程
self
.
_thread
=
Worker
(
predictors
,
self
.
save_pdf
,
self
.
vis_font_path
)
self
.
_thread
.
progressBarValue
.
connect
(
self
.
handleProgressBarSingal
)
self
.
_thread
=
Worker
(
predictors
,
self
.
save_pdf
,
self
.
vis_font_path
,
self
.
use_pdf2docx_api
)
self
.
_thread
.
progressBarValue
.
connect
(
self
.
handleProgressBar
Update
Singal
)
self
.
_thread
.
endsignal
.
connect
(
self
.
handleEndsignalSignal
)
self
.
_thread
.
finished
.
connect
(
QObject
.
deleteLater
)
# self._thread.finished.connect(QObject.deleteLater)
self
.
_thread
.
progressBarRange
.
connect
(
self
.
handleProgressBarRangeSingal
)
self
.
_thread
.
exceptedsignal
.
connect
(
self
.
handleThreadException
)
self
.
time_start
=
0
# save start time
def
setupUi
(
self
):
...
...
@@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget):
self
.
startCNButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/chinese.png"
)))
layout
.
addWidget
(
self
.
startCNButton
,
0
,
1
,
1
,
1
)
self
.
startCNButton
.
clicked
.
connect
(
functools
.
partial
(
self
.
handleStartSignal
,
'CN'
))
functools
.
partial
(
self
.
handleStartSignal
,
'CN'
,
False
))
self
.
startENButton
=
QPushButton
(
"英文转换"
)
self
.
startENButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/english.png"
)))
layout
.
addWidget
(
self
.
startENButton
,
0
,
2
,
1
,
1
)
self
.
startENButton
.
clicked
.
connect
(
functools
.
partial
(
self
.
handleStartSignal
,
'EN'
))
functools
.
partial
(
self
.
handleStartSignal
,
'EN'
,
False
))
self
.
PDFParserButton
=
QPushButton
(
'PDF解析'
,
self
)
layout
.
addWidget
(
self
.
PDFParserButton
,
0
,
3
,
1
,
1
)
self
.
PDFParserButton
.
clicked
.
connect
(
functools
.
partial
(
self
.
handleStartSignal
,
'CN'
,
True
))
self
.
showResultButton
=
QPushButton
(
"显示结果"
)
self
.
showResultButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/folder-open.png"
)))
layout
.
addWidget
(
self
.
showResultButton
,
0
,
3
,
1
,
1
)
layout
.
addWidget
(
self
.
showResultButton
,
0
,
4
,
1
,
1
)
self
.
showResultButton
.
clicked
.
connect
(
self
.
handleShowResultSignal
)
# ProgressBar
layout
.
addWidget
(
self
.
pb
,
2
,
0
,
1
,
4
)
layout
.
addWidget
(
self
.
pb
,
2
,
0
,
1
,
5
)
# time estimate label
self
.
timeEstLabel
=
QLabel
(
(
"Time Left: --"
))
layout
.
addWidget
(
self
.
timeEstLabel
,
3
,
0
,
1
,
4
)
layout
.
addWidget
(
self
.
timeEstLabel
,
3
,
0
,
1
,
5
)
self
.
setLayout
(
layout
)
...
...
@@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget):
if
len
(
selectedFiles
)
>
0
:
self
.
imagePaths
=
selectedFiles
self
.
screenShot
=
None
# discard screenshot temp image
self
.
pb
.
setRange
(
0
,
len
(
self
.
imagePaths
))
self
.
pb
.
setValue
(
0
)
# def screenShotSlot(self):
...
...
@@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget):
# self.pb.setRange(0, 1)
# self.pb.setValue(0)
def
handleStartSignal
(
self
,
lang
):
def
handleStartSignal
(
self
,
lang
=
'EN'
,
pdfParser
=
False
):
if
self
.
screenShot
:
# for screenShot
img_name
=
'screenshot_'
+
time
.
strftime
(
"%Y%m%d%H%M%S"
,
time
.
localtime
())
image
=
QImageToCvMat
(
self
.
screenShot
)
...
...
@@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget):
self
.
_thread
.
setOutputDir
(
self
.
output_dir
)
self
.
_thread
.
setImagePath
(
self
.
imagePaths
)
self
.
_thread
.
setLang
(
lang
)
self
.
_thread
.
setPDFParser
(
pdfParser
)
# disenble buttons
self
.
openFileButton
.
setEnabled
(
False
)
self
.
startCNButton
.
setEnabled
(
False
)
self
.
startENButton
.
setEnabled
(
False
)
self
.
PDFParserButton
.
setEnabled
(
False
)
# 启动工作进程
self
.
_thread
.
start
()
self
.
time_start
=
time
.
time
()
# log start time
...
...
@@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget):
QMessageBox
.
information
(
self
,
u
'Information'
,
"输出文件不存在"
)
def
handleProgressBarSingal
(
self
,
i
):
def
handleProgressBar
Update
Singal
(
self
,
i
):
self
.
pb
.
setValue
(
i
)
# calculate time left of recognition
lenbar
=
self
.
pb
.
maximum
()
...
...
@@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget):
time_left
=
str
(
datetime
.
timedelta
(
seconds
=
avg_time
*
(
lenbar
-
i
))).
split
(
"."
)[
0
]
# Remove microseconds
self
.
timeEstLabel
.
setText
(
f
"Time Left:
{
time_left
}
"
)
# show time left
def
handleProgressBarRangeSingal
(
self
,
max
):
self
.
pb
.
setRange
(
0
,
max
)
def
handleEndsignalSignal
(
self
):
# enble buttons
self
.
openFileButton
.
setEnabled
(
True
)
self
.
startCNButton
.
setEnabled
(
True
)
self
.
startENButton
.
setEnabled
(
True
)
self
.
PDFParserButton
.
setEnabled
(
True
)
QMessageBox
.
information
(
self
,
u
'Information'
,
"转换结束"
)
def
handleCBChangeSignal
(
self
):
self
.
_thread
.
setPDFParser
(
self
.
checkBox
.
isChecked
())
def
handleThreadException
(
self
,
message
):
self
.
_thread
.
quit
()
QMessageBox
.
information
(
self
,
message
)
def
main
():
app
=
QApplication
(
sys
.
argv
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录