Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleOCR
提交
839357fa
P
PaddleOCR
项目概览
PaddlePaddle
/
PaddleOCR
大约 1 年 前同步成功
通知
1528
Star
32962
Fork
6643
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
108
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
108
Issue
108
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
839357fa
编写于
9月 07, 2022
作者:
W
whjdark
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
new table gt format
上级
70ad319a
变更
3
显示空白变更内容
内联
并排
Showing
3 changed files
with
53 additions
and
171 deletions
+53
-171
PPOCRLabel/PPOCRLabel.py
PPOCRLabel/PPOCRLabel.py
+33
-46
PPOCRLabel/libs/dataPartitionDialog.py
PPOCRLabel/libs/dataPartitionDialog.py
+0
-113
PPOCRLabel/libs/utils.py
PPOCRLabel/libs/utils.py
+20
-12
未找到文件。
PPOCRLabel/PPOCRLabel.py
浏览文件 @
839357fa
...
...
@@ -2449,13 +2449,6 @@ class MainWindow(QMainWindow):
export PPLabel and CSV to JSON (PubTabNet)
'''
import
pandas
as
pd
from
libs.dataPartitionDialog
import
DataPartitionDialog
# data partition user input
partitionDialog
=
DataPartitionDialog
(
parent
=
self
)
partitionDialog
.
exec
()
if
partitionDialog
.
getStatus
()
==
False
:
return
# automatically save annotations
self
.
saveFilestate
()
...
...
@@ -2479,27 +2472,18 @@ class MainWindow(QMainWindow):
else
:
labeldict
[
file
]
=
[]
train_split
,
val_split
,
test_split
=
partitionDialog
.
getDataPartition
()
# check validate
if
train_split
+
val_split
+
test_split
>
100
:
msg
=
"The sum of training, validation and testing data should be less than 100%"
QMessageBox
.
information
(
self
,
"Information"
,
msg
)
return
print
(
train_split
,
val_split
,
test_split
)
train_split
,
val_split
,
test_split
=
float
(
train_split
)
/
100.
,
float
(
val_split
)
/
100.
,
float
(
test_split
)
/
100.
train_id
=
int
(
len
(
labeldict
)
*
train_split
)
val_id
=
int
(
len
(
labeldict
)
*
(
train_split
+
val_split
))
print
(
'Data partition: train:'
,
train_id
,
'validation:'
,
val_id
-
train_id
,
'test:'
,
len
(
labeldict
)
-
val_id
)
TableRec_excel_dir
=
os
.
path
.
join
(
self
.
lastOpenDir
,
'tableRec_excel_output'
)
json_results
=
[]
imgid
=
0
# read table recognition output
TableRec_excel_dir
=
os
.
path
.
join
(
self
.
lastOpenDir
,
'tableRec_excel_output'
)
# save txt
fid
=
open
(
"{}/gt.txt"
.
format
(
self
.
lastOpenDir
),
"w"
,
encoding
=
'utf-8'
)
for
image_path
in
labeldict
.
keys
():
# load csv annotations
filename
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
image_path
))
csv_path
=
os
.
path
.
join
(
TableRec_excel_dir
,
filename
+
'.xlsx'
)
csv_path
=
os
.
path
.
join
(
TableRec_excel_dir
,
filename
+
'.xlsx'
)
if
not
os
.
path
.
exists
(
csv_path
):
continue
...
...
@@ -2518,28 +2502,31 @@ class MainWindow(QMainWindow):
cells
=
[]
for
anno
in
labeldict
[
image_path
]:
tokens
=
list
(
anno
[
'transcription'
])
obb
=
anno
[
'points'
]
hbb
=
OBB2HBB
(
np
.
array
(
obb
)).
tolist
()
cells
.
append
({
'tokens'
:
tokens
,
'bbox'
:
hbb
})
# data split
if
imgid
<
train_id
:
split
=
'train'
elif
imgid
<
val_id
:
split
=
'val'
else
:
split
=
'test'
# save dict
html
=
{
'structure'
:
{
'tokens'
:
token_list
},
'cells'
:
cells
}
json_results
.
append
({
'filename'
:
os
.
path
.
basename
(
image_path
),
'split'
:
split
,
'imgid'
:
imgid
,
'html'
:
html
})
imgid
+=
1
# save json
with
open
(
"{}/annotation.json"
.
format
(
self
.
lastOpenDir
),
"w"
,
encoding
=
'utf-8'
)
as
fid
:
fid
.
write
(
json
.
dumps
(
json_results
,
ensure_ascii
=
False
))
msg
=
'JSON sucessfully saved in {}/annotation.json'
.
format
(
self
.
lastOpenDir
)
cells
.
append
({
'tokens'
:
tokens
,
'bbox'
:
anno
[
'points'
]
})
# 构造标注信息
html
=
{
'structure'
:
{
'tokens'
:
token_list
},
'cells'
:
cells
}
d
=
{
'filename'
:
os
.
path
.
basename
(
image_path
),
'html'
:
html
}
d
[
'gt'
]
=
rebuild_html_from_ppstructure_label
(
d
)
# imgid += 1
fid
.
write
(
'{}
\n
'
.
format
(
json
.
dumps
(
d
,
ensure_ascii
=
False
)))
# convert to PP-Structure label format
fid
.
close
()
msg
=
'JSON sucessfully saved in {}/gt.txt'
.
format
(
self
.
lastOpenDir
)
QMessageBox
.
information
(
self
,
"Information"
,
msg
)
def
autolcm
(
self
):
...
...
PPOCRLabel/libs/dataPartitionDialog.py
已删除
100644 → 0
浏览文件 @
70ad319a
try
:
from
PyQt5.QtGui
import
*
from
PyQt5.QtCore
import
*
from
PyQt5.QtWidgets
import
*
except
ImportError
:
from
PyQt4.QtGui
import
*
from
PyQt4.QtCore
import
*
from
libs.utils
import
newIcon
import
time
import
datetime
import
json
import
cv2
import
numpy
as
np
BB
=
QDialogButtonBox
class DataPartitionDialog(QDialog):
    """Application-modal dialog that collects train/validation/test percentages.

    The dialog shows a warning message, three integer input fields
    (pre-filled with 70 / 15 / 15) and OK / Cancel buttons.  After running
    the dialog, callers query ``getStatus()`` to learn whether the user
    confirmed and ``getDataPartition()`` to read the three percentages.
    """

    def __init__(self, parent=None):
        """Build the dialog.

        Args:
            parent: optional owner object.  Only its ``lang`` attribute is
                read (``'ch'`` selects the Chinese warning text); when the
                parent is ``None`` or has no ``lang``, English is used.
        """
        super().__init__()
        # The attribute keeps its historical (misspelled) name so that any
        # code reading ``parnet`` keeps working.
        self.parnet = parent
        self.title = 'DATA PARTITION'
        self.train_ratio = 70
        self.val_ratio = 15
        self.test_ratio = 15
        # Start as "not accepted": pressing Esc makes Qt call reject()
        # directly, bypassing cancel() and closeEvent(), so only an explicit
        # OK (validate) may ever report acceptance.
        self.flag_accept = False
        self.initUI()

    def initUI(self):
        """Create the widgets, lay them out in a grid and show the dialog."""
        self.setWindowTitle(self.title)
        self.setWindowModality(Qt.ApplicationModal)

        # Fall back to English when there is no parent (or it has no `lang`),
        # instead of failing on attribute access.
        if getattr(self.parnet, 'lang', 'en') == 'ch':
            msg = "导出JSON前请保存所有图像的标注且关闭EXCEL!"
        else:
            msg = "Please save all the annotations and close the EXCEL before exporting JSON!"

        info_msg = QLabel(msg, self)
        info_msg.setWordWrap(True)
        info_msg.setStyleSheet("color: red")
        info_msg.setFont(QFont('Arial', 12))

        train_lbl = QLabel('Train split: ', self)
        val_lbl = QLabel('Valid split: ', self)
        test_lbl = QLabel('Test split: ', self)

        self.train_input = QLineEdit(self)
        self.val_input = QLineEdit(self)
        self.test_input = QLineEdit(self)

        # Parent the validator to the dialog so it lives as long as the
        # line edits that share it.
        validator = QIntValidator(0, 100, self)

        rows = (
            (train_lbl, self.train_input, self.train_ratio),
            (val_lbl, self.val_input, self.val_ratio),
            (test_lbl, self.test_input, self.test_ratio),
        )

        gridlayout = QGridLayout()
        # The warning spans both columns of the first row.
        gridlayout.addWidget(info_msg, 0, 0, 1, 2)
        for row, (label, field, ratio) in enumerate(rows, start=1):
            label.setFont(QFont('Arial', 15))
            field.setFont(QFont('Arial', 15))
            field.setText(str(ratio))
            field.setValidator(validator)
            gridlayout.addWidget(label, row, 0)
            gridlayout.addWidget(field, row, 1)

        bb = BB(BB.Ok | BB.Cancel, Qt.Horizontal, self)
        bb.button(BB.Ok).setIcon(newIcon('done'))
        bb.button(BB.Cancel).setIcon(newIcon('undo'))
        bb.accepted.connect(self.validate)
        bb.rejected.connect(self.cancel)
        gridlayout.addWidget(bb, 4, 0, 1, 2)

        self.setLayout(gridlayout)
        self.show()

    def validate(self):
        """Slot for the OK button: record acceptance and close the dialog."""
        self.flag_accept = True
        self.accept()

    def cancel(self):
        """Slot for the Cancel button: record rejection and close the dialog."""
        self.flag_accept = False
        self.reject()

    def getStatus(self):
        """Return True only if the dialog was confirmed through OK."""
        return self.flag_accept

    @staticmethod
    def _parse_ratio(text):
        """Convert a field's text to an int; unparsable or empty text counts as 0."""
        try:
            return int(text)
        except ValueError:
            # The validator still allows an empty field; treat that as 0 %.
            return 0

    def getDataPartition(self):
        """Read the three input fields.

        Returns:
            tuple: ``(train_ratio, val_ratio, test_ratio)`` as ints.  A field
            left empty yields 0.  The values are also stored on the instance.
        """
        self.train_ratio = self._parse_ratio(self.train_input.text())
        self.val_ratio = self._parse_ratio(self.val_input.text())
        self.test_ratio = self._parse_ratio(self.test_input.text())
        return self.train_ratio, self.val_ratio, self.test_ratio

    def closeEvent(self, event):
        """Treat closing the window as a cancellation."""
        self.flag_accept = False
        self.reject()
PPOCRLabel/libs/utils.py
浏览文件 @
839357fa
...
...
@@ -176,18 +176,6 @@ def boxPad(box, imgShape, pad : int) -> np.array:
return
box
def OBB2HBB(obb) -> np.ndarray:
    """
    Convert Oriented Bounding Box to Horizontal Bounding Box.

    Args:
        obb: 2-D array-like of points with at least two columns; column 0
            and column 1 are read (presumably the x and y coordinates).
            Nested lists are accepted as well as ndarrays.

    Returns:
        int32 ndarray of length 4:
        ``[min(col 0), min(col 1), max(col 0), max(col 1)]``.
        Non-integer inputs are cast to int32.
    """
    # asarray is a no-op for ndarrays and lets plain nested lists work too.
    pts = np.asarray(obb)
    col0 = pts[:, 0]
    col1 = pts[:, 1]
    # Array reductions instead of Python's builtin min/max over array items.
    extents = np.array([col0.min(), col1.min(), col0.max(), col1.max()])
    return extents.astype(np.int32)
def
expand_list
(
merged
,
html_list
):
'''
Fill blanks according to merged cells
...
...
@@ -232,6 +220,26 @@ def convert_token(html_list):
return
token_list
def rebuild_html_from_ppstructure_label(label_info):
    """Assemble a full HTML table string from a structure/cells label dict.

    Reads ``label_info['html']['structure']['tokens']`` (the structural tag
    tokens) and ``label_info['html']['cells']`` (one dict per cell, each with
    a ``'tokens'`` list).  Cell text is inserted right after each ``'<td>'``
    or ``'>'`` structure token; single-character cell tokens are HTML-escaped,
    longer tokens are inserted verbatim.  The input is not modified.

    Returns:
        str: the joined tokens wrapped in
        ``<html><body><table>...</table></body></html>``.
    """
    from html import escape

    pieces = label_info['html']['structure']['tokens'].copy()
    cells = label_info['html']['cells']

    # Positions of the tokens that open a cell's content.
    slots = []
    for position, tag in enumerate(pieces):
        if tag == '<td>' or tag == '>':
            slots.append(position)

    # Fill from the last slot backwards so earlier positions stay valid
    # while text is being inserted.
    for position, cell_info in zip(slots[::-1], cells[::-1]):
        cell_tokens = cell_info['tokens']
        if not cell_tokens:
            continue
        rendered = []
        for token in cell_tokens:
            rendered.append(escape(token) if len(token) == 1 else token)
        pieces.insert(position + 1, ''.join(rendered))

    table_body = ''.join(pieces)
    return '<html><body><table>{}</table></body></html>'.format(table_body)
def
stepsInfo
(
lang
=
'en'
):
if
lang
==
'ch'
:
msg
=
"1. 安装与运行:使用上述命令安装与运行程序。
\n
"
\
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录