Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleOCR
提交
f91026dd
P
PaddleOCR
项目概览
PaddlePaddle
/
PaddleOCR
大约 1 年 前同步成功
通知
1528
Star
32962
Fork
6643
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
108
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
108
Issue
108
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f91026dd
编写于
10月 20, 2022
作者:
E
Evezerest
提交者:
GitHub
10月 20, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #8005 from whjdark/patch-5
pdf2word v0.2.2
上级
3907c72a
0f70eaf2
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
98 addition
and
41 deletion
+98
-41
ppstructure/pdf2word/pdf2word.py
ppstructure/pdf2word/pdf2word.py
+98
-41
未找到文件。
ppstructure/pdf2word/pdf2word.py
浏览文件 @
f91026dd
...
@@ -7,8 +7,11 @@ import functools
...
@@ -7,8 +7,11 @@ import functools
import
cv2
import
cv2
import
platform
import
platform
import
numpy
as
np
import
numpy
as
np
import
fitz
from
PIL
import
Image
from
pdf2docx.converter
import
Converter
from
qtpy.QtWidgets
import
QApplication
,
QWidget
,
QPushButton
,
QProgressBar
,
\
from
qtpy.QtWidgets
import
QApplication
,
QWidget
,
QPushButton
,
QProgressBar
,
\
QGridLayout
,
QMessageBox
,
QLabel
,
QFileDialog
QGridLayout
,
QMessageBox
,
QLabel
,
QFileDialog
,
QCheckBox
from
qtpy.QtCore
import
Signal
,
QThread
,
QObject
from
qtpy.QtCore
import
Signal
,
QThread
,
QObject
from
qtpy.QtGui
import
QImage
,
QPixmap
,
QIcon
from
qtpy.QtGui
import
QImage
,
QPixmap
,
QIcon
...
@@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../'))
...
@@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../'))
sys
.
path
.
append
(
file
)
sys
.
path
.
append
(
file
)
sys
.
path
.
insert
(
0
,
root
)
sys
.
path
.
insert
(
0
,
root
)
from
ppstructure.predict_system
import
StructureSystem
,
save_structure_res
from
ppstructure.predict_system
import
StructureSystem
,
save_structure_res
from
ppstructure.utility
import
parse_args
,
draw_structure_result
from
ppstructure.utility
import
parse_args
,
draw_structure_result
from
ppocr.utils.network
import
download_with_progressbar
from
ppocr.utils.network
import
download_with_progressbar
...
@@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in
...
@@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in
# from ScreenShotWidget import ScreenShotWidget
# from ScreenShotWidget import ScreenShotWidget
__APPNAME__
=
"pdf2word"
__APPNAME__
=
"pdf2word"
__VERSION__
=
"0.
1.1
"
__VERSION__
=
"0.
2.2
"
URLs_EN
=
{
URLs_EN
=
{
# 下载超英文轻量级PP-OCRv3模型的检测模型并解压
# 下载超英文轻量级PP-OCRv3模型的检测模型并解压
...
@@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array:
...
@@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array:
def
readImage
(
image_file
)
->
list
:
def
readImage
(
image_file
)
->
list
:
if
os
.
path
.
basename
(
image_file
)[
-
3
:]
in
[
'pdf'
]:
if
os
.
path
.
basename
(
image_file
)[
-
3
:]
==
'pdf'
:
import
fitz
from
PIL
import
Image
imgs
=
[]
imgs
=
[]
with
fitz
.
open
(
image_file
)
as
pdf
:
with
fitz
.
open
(
image_file
)
as
pdf
:
for
pg
in
range
(
0
,
pdf
.
pageCount
):
for
pg
in
range
(
0
,
pdf
.
pageCount
):
...
@@ -102,17 +104,22 @@ def readImage(image_file) -> list:
...
@@ -102,17 +104,22 @@ def readImage(image_file) -> list:
class
Worker
(
QThread
):
class
Worker
(
QThread
):
progressBarValue
=
Signal
(
int
)
progressBarValue
=
Signal
(
int
)
progressBarRange
=
Signal
(
int
)
endsignal
=
Signal
()
endsignal
=
Signal
()
exceptedsignal
=
Signal
(
str
)
#发送一个异常信号
loopFlag
=
True
loopFlag
=
True
def
__init__
(
self
,
predictors
,
save_pdf
,
vis_font_path
):
def
__init__
(
self
,
predictors
,
save_pdf
,
vis_font_path
,
use_pdf2docx_api
):
super
(
Worker
,
self
).
__init__
()
super
(
Worker
,
self
).
__init__
()
self
.
predictors
=
predictors
self
.
predictors
=
predictors
self
.
save_pdf
=
save_pdf
self
.
save_pdf
=
save_pdf
self
.
vis_font_path
=
vis_font_path
self
.
vis_font_path
=
vis_font_path
self
.
lang
=
'EN'
self
.
lang
=
'EN'
self
.
imagePaths
=
[]
self
.
imagePaths
=
[]
self
.
use_pdf2docx_api
=
use_pdf2docx_api
self
.
outputDir
=
None
self
.
outputDir
=
None
self
.
totalPageCnt
=
0
self
.
pageCnt
=
0
self
.
setStackSize
(
1024
*
1024
)
self
.
setStackSize
(
1024
*
1024
)
def
setImagePath
(
self
,
imagePaths
):
def
setImagePath
(
self
,
imagePaths
):
...
@@ -123,61 +130,91 @@ class Worker(QThread):
...
@@ -123,61 +130,91 @@ class Worker(QThread):
def
setOutputDir
(
self
,
outputDir
):
def
setOutputDir
(
self
,
outputDir
):
self
.
outputDir
=
outputDir
self
.
outputDir
=
outputDir
def
setPDFParser
(
self
,
enabled
):
self
.
use_pdf2docx_api
=
enabled
def
resetPageCnt
(
self
):
self
.
pageCnt
=
0
def
resetTotalPageCnt
(
self
):
self
.
totalPageCnt
=
0
def
p
redictAndSave
(
self
,
imgs
,
img_name
):
def
p
pocrPrecitor
(
self
,
imgs
,
img_name
):
all_res
=
[]
all_res
=
[]
# update progress bar ranges
self
.
totalPageCnt
+=
len
(
imgs
)
self
.
progressBarRange
.
emit
(
self
.
totalPageCnt
)
# processing pages
for
index
,
img
in
enumerate
(
imgs
):
for
index
,
img
in
enumerate
(
imgs
):
res
,
time_dict
=
self
.
predictors
[
self
.
lang
](
img
)
res
,
time_dict
=
self
.
predictors
[
self
.
lang
](
img
)
# save output
# save output
save_structure_res
(
res
,
self
.
outputDir
,
img_name
)
save_structure_res
(
res
,
self
.
outputDir
,
img_name
)
draw_img
=
draw_structure_result
(
img
,
res
,
self
.
vis_font_path
)
#
draw_img = draw_structure_result(img, res, self.vis_font_path)
img_save_path
=
os
.
path
.
join
(
self
.
outputDir
,
img_name
,
'show_{}.jpg'
.
format
(
index
))
#
img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index))
if
res
!=
[]:
#
if res != []:
cv2
.
imwrite
(
img_save_path
,
draw_img
)
#
cv2.imwrite(img_save_path, draw_img)
# recovery
# recovery
h
,
w
,
_
=
img
.
shape
h
,
w
,
_
=
img
.
shape
res
=
sorted_layout_boxes
(
res
,
w
)
res
=
sorted_layout_boxes
(
res
,
w
)
all_res
+=
res
all_res
+=
res
self
.
pageCnt
+=
1
self
.
progressBarValue
.
emit
(
self
.
pageCnt
)
try
:
if
all_res
!=
[]
:
convert_info_docx
(
img
,
all_res
,
self
.
outputDir
,
img_name
,
self
.
save_pdf
)
try
:
except
Exception
as
ex
:
convert_info_docx
(
imgs
,
all_res
,
self
.
outputDir
,
img_name
)
print
(
self
,
except
Exception
as
ex
:
"error in layout recovery image:{}, err msg: {}"
.
format
(
print
(
"error in layout recovery image:{}, err msg: {}"
.
img_name
,
ex
))
format
(
img_name
,
ex
))
print
(
"Predict time : {:.3f}s"
.
format
(
time_dict
[
'all'
]))
print
(
'result save to {}'
.
format
(
self
.
outputDir
))
print
(
'result save to {}'
.
format
(
self
.
outputDir
))
def
run
(
self
):
def
run
(
self
):
self
.
resetPageCnt
()
self
.
resetTotalPageCnt
()
try
:
try
:
findex
=
0
os
.
makedirs
(
self
.
outputDir
,
exist_ok
=
True
)
os
.
makedirs
(
self
.
outputDir
,
exist_ok
=
True
)
for
i
,
image_file
in
enumerate
(
self
.
imagePaths
):
for
i
,
image_file
in
enumerate
(
self
.
imagePaths
):
if
self
.
loopFlag
==
True
:
if
not
self
.
loopFlag
:
break
# using use_pdf2docx_api for PDF parsing
if
self
.
use_pdf2docx_api
\
and
os
.
path
.
basename
(
image_file
)[
-
3
:]
==
'pdf'
:
self
.
totalPageCnt
+=
1
self
.
progressBarRange
.
emit
(
self
.
totalPageCnt
)
print
(
'===============using use_pdf2docx_api==============='
)
img_name
=
os
.
path
.
basename
(
image_file
).
split
(
'.'
)[
0
]
docx_file
=
os
.
path
.
join
(
self
.
outputDir
,
'{}.docx'
.
format
(
img_name
))
cv
=
Converter
(
image_file
)
cv
.
convert
(
docx_file
)
cv
.
close
()
print
(
'docx save to {}'
.
format
(
docx_file
))
self
.
pageCnt
+=
1
self
.
progressBarValue
.
emit
(
self
.
pageCnt
)
else
:
# using PPOCR for PDF/Image parsing
imgs
=
readImage
(
image_file
)
imgs
=
readImage
(
image_file
)
if
len
(
imgs
)
==
0
:
if
len
(
imgs
)
==
0
:
continue
continue
img_name
=
os
.
path
.
basename
(
image_file
).
split
(
'.'
)[
0
]
img_name
=
os
.
path
.
basename
(
image_file
).
split
(
'.'
)[
0
]
os
.
makedirs
(
os
.
path
.
join
(
self
.
outputDir
,
img_name
),
exist_ok
=
True
)
os
.
makedirs
(
os
.
path
.
join
(
self
.
outputDir
,
img_name
),
exist_ok
=
True
)
self
.
predictAndSave
(
imgs
,
img_name
)
self
.
ppocrPrecitor
(
imgs
,
img_name
)
findex
+=
1
# file processed
self
.
progressBarValue
.
emit
(
findex
)
else
:
break
self
.
endsignal
.
emit
()
self
.
endsignal
.
emit
()
self
.
exec
()
#
self.exec()
except
Exception
as
e
:
except
Exception
as
e
:
print
(
e
)
self
.
exceptedsignal
.
emit
(
str
(
e
))
# 将异常发送给UI进程
raise
class
APP_Image2Doc
(
QWidget
):
class
APP_Image2Doc
(
QWidget
):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
self
.
setFixedHeight
(
9
0
)
self
.
setFixedHeight
(
10
0
)
self
.
setFixedWidth
(
4
0
0
)
self
.
setFixedWidth
(
4
2
0
)
# settings
# settings
self
.
imagePaths
=
[]
self
.
imagePaths
=
[]
...
@@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget):
...
@@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget):
self
.
output_dir
=
None
self
.
output_dir
=
None
self
.
vis_font_path
=
os
.
path
.
join
(
root
,
self
.
vis_font_path
=
os
.
path
.
join
(
root
,
"doc"
,
"fonts"
,
"simfang.ttf"
)
"doc"
,
"fonts"
,
"simfang.ttf"
)
self
.
use_pdf2docx_api
=
False
# ProgressBar
# ProgressBar
self
.
pb
=
QProgressBar
()
self
.
pb
=
QProgressBar
()
...
@@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget):
...
@@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget):
}
}
# 设置工作进程
# 设置工作进程
self
.
_thread
=
Worker
(
predictors
,
self
.
save_pdf
,
self
.
vis_font_path
)
self
.
_thread
=
Worker
(
predictors
,
self
.
save_pdf
,
self
.
vis_font_path
,
self
.
use_pdf2docx_api
)
self
.
_thread
.
progressBarValue
.
connect
(
self
.
handleProgressBarSingal
)
self
.
_thread
.
progressBarValue
.
connect
(
self
.
handleProgressBar
Update
Singal
)
self
.
_thread
.
endsignal
.
connect
(
self
.
handleEndsignalSignal
)
self
.
_thread
.
endsignal
.
connect
(
self
.
handleEndsignalSignal
)
self
.
_thread
.
finished
.
connect
(
QObject
.
deleteLater
)
# self._thread.finished.connect(QObject.deleteLater)
self
.
_thread
.
progressBarRange
.
connect
(
self
.
handleProgressBarRangeSingal
)
self
.
_thread
.
exceptedsignal
.
connect
(
self
.
handleThreadException
)
self
.
time_start
=
0
# save start time
self
.
time_start
=
0
# save start time
def
setupUi
(
self
):
def
setupUi
(
self
):
...
@@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget):
...
@@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget):
self
.
startCNButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/chinese.png"
)))
self
.
startCNButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/chinese.png"
)))
layout
.
addWidget
(
self
.
startCNButton
,
0
,
1
,
1
,
1
)
layout
.
addWidget
(
self
.
startCNButton
,
0
,
1
,
1
,
1
)
self
.
startCNButton
.
clicked
.
connect
(
self
.
startCNButton
.
clicked
.
connect
(
functools
.
partial
(
self
.
handleStartSignal
,
'CN'
))
functools
.
partial
(
self
.
handleStartSignal
,
'CN'
,
False
))
self
.
startENButton
=
QPushButton
(
"英文转换"
)
self
.
startENButton
=
QPushButton
(
"英文转换"
)
self
.
startENButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/english.png"
)))
self
.
startENButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/english.png"
)))
layout
.
addWidget
(
self
.
startENButton
,
0
,
2
,
1
,
1
)
layout
.
addWidget
(
self
.
startENButton
,
0
,
2
,
1
,
1
)
self
.
startENButton
.
clicked
.
connect
(
self
.
startENButton
.
clicked
.
connect
(
functools
.
partial
(
self
.
handleStartSignal
,
'EN'
))
functools
.
partial
(
self
.
handleStartSignal
,
'EN'
,
False
))
self
.
PDFParserButton
=
QPushButton
(
'PDF解析'
,
self
)
layout
.
addWidget
(
self
.
PDFParserButton
,
0
,
3
,
1
,
1
)
self
.
PDFParserButton
.
clicked
.
connect
(
functools
.
partial
(
self
.
handleStartSignal
,
'CN'
,
True
))
self
.
showResultButton
=
QPushButton
(
"显示结果"
)
self
.
showResultButton
=
QPushButton
(
"显示结果"
)
self
.
showResultButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/folder-open.png"
)))
self
.
showResultButton
.
setIcon
(
QIcon
(
QPixmap
(
"./icons/folder-open.png"
)))
layout
.
addWidget
(
self
.
showResultButton
,
0
,
3
,
1
,
1
)
layout
.
addWidget
(
self
.
showResultButton
,
0
,
4
,
1
,
1
)
self
.
showResultButton
.
clicked
.
connect
(
self
.
handleShowResultSignal
)
self
.
showResultButton
.
clicked
.
connect
(
self
.
handleShowResultSignal
)
# ProgressBar
# ProgressBar
layout
.
addWidget
(
self
.
pb
,
2
,
0
,
1
,
4
)
layout
.
addWidget
(
self
.
pb
,
2
,
0
,
1
,
5
)
# time estimate label
# time estimate label
self
.
timeEstLabel
=
QLabel
(
self
.
timeEstLabel
=
QLabel
(
(
"Time Left: --"
))
(
"Time Left: --"
))
layout
.
addWidget
(
self
.
timeEstLabel
,
3
,
0
,
1
,
4
)
layout
.
addWidget
(
self
.
timeEstLabel
,
3
,
0
,
1
,
5
)
self
.
setLayout
(
layout
)
self
.
setLayout
(
layout
)
...
@@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget):
...
@@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget):
if
len
(
selectedFiles
)
>
0
:
if
len
(
selectedFiles
)
>
0
:
self
.
imagePaths
=
selectedFiles
self
.
imagePaths
=
selectedFiles
self
.
screenShot
=
None
# discard screenshot temp image
self
.
screenShot
=
None
# discard screenshot temp image
self
.
pb
.
setRange
(
0
,
len
(
self
.
imagePaths
))
self
.
pb
.
setValue
(
0
)
self
.
pb
.
setValue
(
0
)
# def screenShotSlot(self):
# def screenShotSlot(self):
...
@@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget):
...
@@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget):
# self.pb.setRange(0, 1)
# self.pb.setRange(0, 1)
# self.pb.setValue(0)
# self.pb.setValue(0)
def
handleStartSignal
(
self
,
lang
):
def
handleStartSignal
(
self
,
lang
=
'EN'
,
pdfParser
=
False
):
if
self
.
screenShot
:
# for screenShot
if
self
.
screenShot
:
# for screenShot
img_name
=
'screenshot_'
+
time
.
strftime
(
"%Y%m%d%H%M%S"
,
time
.
localtime
())
img_name
=
'screenshot_'
+
time
.
strftime
(
"%Y%m%d%H%M%S"
,
time
.
localtime
())
image
=
QImageToCvMat
(
self
.
screenShot
)
image
=
QImageToCvMat
(
self
.
screenShot
)
...
@@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget):
...
@@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget):
self
.
_thread
.
setOutputDir
(
self
.
output_dir
)
self
.
_thread
.
setOutputDir
(
self
.
output_dir
)
self
.
_thread
.
setImagePath
(
self
.
imagePaths
)
self
.
_thread
.
setImagePath
(
self
.
imagePaths
)
self
.
_thread
.
setLang
(
lang
)
self
.
_thread
.
setLang
(
lang
)
self
.
_thread
.
setPDFParser
(
pdfParser
)
# disenble buttons
# disenble buttons
self
.
openFileButton
.
setEnabled
(
False
)
self
.
openFileButton
.
setEnabled
(
False
)
self
.
startCNButton
.
setEnabled
(
False
)
self
.
startCNButton
.
setEnabled
(
False
)
self
.
startENButton
.
setEnabled
(
False
)
self
.
startENButton
.
setEnabled
(
False
)
self
.
PDFParserButton
.
setEnabled
(
False
)
# 启动工作进程
# 启动工作进程
self
.
_thread
.
start
()
self
.
_thread
.
start
()
self
.
time_start
=
time
.
time
()
# log start time
self
.
time_start
=
time
.
time
()
# log start time
...
@@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget):
...
@@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget):
QMessageBox
.
information
(
self
,
QMessageBox
.
information
(
self
,
u
'Information'
,
"输出文件不存在"
)
u
'Information'
,
"输出文件不存在"
)
def
handleProgressBarSingal
(
self
,
i
):
def
handleProgressBar
Update
Singal
(
self
,
i
):
self
.
pb
.
setValue
(
i
)
self
.
pb
.
setValue
(
i
)
# calculate time left of recognition
# calculate time left of recognition
lenbar
=
self
.
pb
.
maximum
()
lenbar
=
self
.
pb
.
maximum
()
...
@@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget):
...
@@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget):
time_left
=
str
(
datetime
.
timedelta
(
seconds
=
avg_time
*
(
lenbar
-
i
))).
split
(
"."
)[
0
]
# Remove microseconds
time_left
=
str
(
datetime
.
timedelta
(
seconds
=
avg_time
*
(
lenbar
-
i
))).
split
(
"."
)[
0
]
# Remove microseconds
self
.
timeEstLabel
.
setText
(
f
"Time Left:
{
time_left
}
"
)
# show time left
self
.
timeEstLabel
.
setText
(
f
"Time Left:
{
time_left
}
"
)
# show time left
def
handleProgressBarRangeSingal
(
self
,
max
):
self
.
pb
.
setRange
(
0
,
max
)
def
handleEndsignalSignal
(
self
):
def
handleEndsignalSignal
(
self
):
# enble buttons
# enble buttons
self
.
openFileButton
.
setEnabled
(
True
)
self
.
openFileButton
.
setEnabled
(
True
)
self
.
startCNButton
.
setEnabled
(
True
)
self
.
startCNButton
.
setEnabled
(
True
)
self
.
startENButton
.
setEnabled
(
True
)
self
.
startENButton
.
setEnabled
(
True
)
self
.
PDFParserButton
.
setEnabled
(
True
)
QMessageBox
.
information
(
self
,
u
'Information'
,
"转换结束"
)
QMessageBox
.
information
(
self
,
u
'Information'
,
"转换结束"
)
def
handleCBChangeSignal
(
self
):
self
.
_thread
.
setPDFParser
(
self
.
checkBox
.
isChecked
())
def
handleThreadException
(
self
,
message
):
self
.
_thread
.
quit
()
QMessageBox
.
information
(
self
,
message
)
def
main
():
def
main
():
app
=
QApplication
(
sys
.
argv
)
app
=
QApplication
(
sys
.
argv
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录